 include/linux/sched.h        |  10
 include/linux/sched/sysctl.h |   1
 include/trace/events/sched.h | 115
 kernel/sched/core.c          | 735
 kernel/sched/fair.c          |  26
 kernel/sched/sched.h         |  26
 kernel/sched/sched_avg.c     |   2
 kernel/sysctl.c              |   7
 8 files changed, 764 insertions(+), 158 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e107c3d7a5c..61a5c00e66cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -328,6 +328,16 @@ enum task_event {
IRQ_UPDATE = 5,
};
+/* Note: this needs to be kept in sync with the migrate_type_names array */
+enum migrate_types {
+ GROUP_TO_RQ,
+ RQ_TO_GROUP,
+ RQ_TO_RQ,
+ GROUP_TO_GROUP,
+};
+
+extern const char *migrate_type_names[];
+
#include <linux/spinlock.h>
/*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 84bac3e07709..2ac84af88802 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -75,6 +75,7 @@ extern unsigned int sysctl_sched_restrict_cluster_spill;
#if defined(CONFIG_SCHED_FREQ_INPUT)
extern unsigned int sysctl_sched_new_task_windows;
extern unsigned int sysctl_sched_pred_alert_freq;
+extern unsigned int sysctl_sched_freq_aggregate;
#endif
#else /* CONFIG_SCHED_HMP */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 6c5fa35e2875..81415b78ef39 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -9,6 +9,8 @@
#include <linux/binfmts.h>
struct rq;
+struct group_cpu_time;
+struct migration_sum_data;
extern const char *task_event_names[];
/*
@@ -269,9 +271,10 @@ TRACE_EVENT(sched_set_boost,
TRACE_EVENT(sched_update_task_ravg,
TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
- u64 wallclock, u64 irqtime, u32 cycles, u32 exec_time),
+ u64 wallclock, u64 irqtime, u32 cycles, u32 exec_time,
+ struct group_cpu_time *cpu_time),
- TP_ARGS(p, rq, evt, wallclock, irqtime, cycles, exec_time),
+ TP_ARGS(p, rq, evt, wallclock, irqtime, cycles, exec_time, cpu_time),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
@@ -290,8 +293,12 @@ TRACE_EVENT(sched_update_task_ravg,
__field( int, cpu )
#ifdef CONFIG_SCHED_FREQ_INPUT
__field(unsigned int, pred_demand )
- __field( u64, cs )
- __field( u64, ps )
+ __field( u64, rq_cs )
+ __field( u64, rq_ps )
+ __field( u64, grp_cs )
+ __field( u64, grp_ps )
+ __field( u64, grp_nt_cs )
+ __field( u64, grp_nt_ps )
__field( u32, curr_window )
__field( u32, prev_window )
__field( u64, nt_cs )
@@ -318,8 +325,12 @@ TRACE_EVENT(sched_update_task_ravg,
__entry->irqtime = irqtime;
#ifdef CONFIG_SCHED_FREQ_INPUT
__entry->pred_demand = p->ravg.pred_demand;
- __entry->cs = rq->curr_runnable_sum;
- __entry->ps = rq->prev_runnable_sum;
+ __entry->rq_cs = rq->curr_runnable_sum;
+ __entry->rq_ps = rq->prev_runnable_sum;
+ __entry->grp_cs = cpu_time ? cpu_time->curr_runnable_sum : 0;
+ __entry->grp_ps = cpu_time ? cpu_time->prev_runnable_sum : 0;
+ __entry->grp_nt_cs = cpu_time ? cpu_time->nt_curr_runnable_sum : 0;
+ __entry->grp_nt_ps = cpu_time ? cpu_time->nt_prev_runnable_sum : 0;
__entry->curr_window = p->ravg.curr_window;
__entry->prev_window = p->ravg.prev_window;
__entry->nt_cs = rq->nt_curr_runnable_sum;
@@ -330,7 +341,7 @@ TRACE_EVENT(sched_update_task_ravg,
TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
#ifdef CONFIG_SCHED_FREQ_INPUT
- " pred_demand %u cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u"
+ " pred_demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu"
#endif
, __entry->wallclock, __entry->win_start, __entry->delta,
task_event_names[__entry->evt], __entry->cpu,
@@ -339,10 +350,12 @@ TRACE_EVENT(sched_update_task_ravg,
__entry->delta_m, __entry->demand,
__entry->sum, __entry->irqtime
#ifdef CONFIG_SCHED_FREQ_INPUT
- , __entry->pred_demand, __entry->cs, __entry->ps,
+ , __entry->pred_demand, __entry->rq_cs, __entry->rq_ps,
__entry->curr_window, __entry->prev_window,
__entry->nt_cs, __entry->nt_ps,
- __entry->active_windows
+ __entry->active_windows,
+ __entry->grp_cs, __entry->grp_ps,
+ __entry->grp_nt_cs, __entry->grp_nt_ps
#endif
)
);
@@ -506,31 +519,62 @@ TRACE_EVENT(sched_update_pred_demand,
TRACE_EVENT(sched_migration_update_sum,
- TP_PROTO(struct rq *rq, struct task_struct *p),
+ TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, struct migration_sum_data *d),
- TP_ARGS(rq, p),
+ TP_ARGS(p, migrate_type, d),
TP_STRUCT__entry(
- __field(int, cpu )
+ __field(int, tcpu )
__field(int, pid )
__field( u64, cs )
__field( u64, ps )
__field( s64, nt_cs )
__field( s64, nt_ps )
+ __field(enum migrate_types, migrate_type )
+ __field( s64, src_cs )
+ __field( s64, src_ps )
+ __field( s64, dst_cs )
+ __field( s64, dst_ps )
+ __field( s64, src_nt_cs )
+ __field( s64, src_nt_ps )
+ __field( s64, dst_nt_cs )
+ __field( s64, dst_nt_ps )
),
TP_fast_assign(
- __entry->cpu = cpu_of(rq);
- __entry->cs = rq->curr_runnable_sum;
- __entry->ps = rq->prev_runnable_sum;
- __entry->nt_cs = (s64)rq->nt_curr_runnable_sum;
- __entry->nt_ps = (s64)rq->nt_prev_runnable_sum;
+ __entry->tcpu = task_cpu(p);
__entry->pid = p->pid;
- ),
-
- TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
- __entry->cpu, __entry->cs, __entry->ps,
- __entry->nt_cs, __entry->nt_ps, __entry->pid)
+ __entry->migrate_type = migrate_type;
+ __entry->src_cs = d->src_rq ?
+ d->src_rq->curr_runnable_sum :
+ d->src_cpu_time->curr_runnable_sum;
+ __entry->src_ps = d->src_rq ?
+ d->src_rq->prev_runnable_sum :
+ d->src_cpu_time->prev_runnable_sum;
+ __entry->dst_cs = d->dst_rq ?
+ d->dst_rq->curr_runnable_sum :
+ d->dst_cpu_time->curr_runnable_sum;
+ __entry->dst_ps = d->dst_rq ?
+ d->dst_rq->prev_runnable_sum :
+ d->dst_cpu_time->prev_runnable_sum;
+ __entry->src_nt_cs = d->src_rq ?
+ d->src_rq->nt_curr_runnable_sum :
+ d->src_cpu_time->nt_curr_runnable_sum;
+ __entry->src_nt_ps = d->src_rq ?
+ d->src_rq->nt_prev_runnable_sum :
+ d->src_cpu_time->nt_prev_runnable_sum;
+ __entry->dst_nt_cs = d->dst_rq ?
+ d->dst_rq->nt_curr_runnable_sum :
+ d->dst_cpu_time->nt_curr_runnable_sum;
+ __entry->dst_nt_ps = d->dst_rq ?
+ d->dst_rq->nt_prev_runnable_sum :
+ d->dst_cpu_time->nt_prev_runnable_sum;
+ ),
+
+ TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld",
+ __entry->pid, __entry->tcpu, migrate_type_names[__entry->migrate_type],
+ __entry->src_cs, __entry->src_ps, __entry->dst_cs, __entry->dst_ps,
+ __entry->src_nt_cs, __entry->src_nt_ps, __entry->dst_nt_cs, __entry->dst_nt_ps)
);
TRACE_EVENT(sched_get_busy,
@@ -562,15 +606,17 @@ TRACE_EVENT(sched_get_busy,
TRACE_EVENT(sched_freq_alert,
- TP_PROTO(int cpu, int pd_notif, u64 old_load, u64 new_load,
- u64 old_pred, u64 new_pred),
+ TP_PROTO(int cpu, int pd_notif, int check_groups, struct rq *rq,
+ u64 new_load),
- TP_ARGS(cpu, pd_notif, old_load, new_load, old_pred, new_pred),
+ TP_ARGS(cpu, pd_notif, check_groups, rq, new_load),
TP_STRUCT__entry(
__field( int, cpu )
__field( int, pd_notif )
- __field( u64, old_load )
+ __field( int, check_groups )
+ __field( u64, old_busy_time )
+ __field( u64, ps )
__field( u64, new_load )
__field( u64, old_pred )
__field( u64, new_pred )
@@ -579,17 +625,18 @@ TRACE_EVENT(sched_freq_alert,
TP_fast_assign(
__entry->cpu = cpu;
__entry->pd_notif = pd_notif;
- __entry->old_load = old_load;
+ __entry->check_groups = check_groups;
+ __entry->old_busy_time = rq->old_busy_time;
+ __entry->ps = rq->prev_runnable_sum;
__entry->new_load = new_load;
- __entry->old_pred = old_pred;
- __entry->new_pred = new_pred;
+ __entry->old_pred = rq->old_estimated_time;
+ __entry->new_pred = rq->hmp_stats.pred_demands_sum;
),
- TP_printk("cpu %d pd_notif=%d old_load=%llu new_load=%llu "
- "old_pred=%llu new_pred=%llu",
- __entry->cpu, __entry->pd_notif, __entry->old_load,
- __entry->new_load, __entry->old_pred,
- __entry->new_pred)
+ TP_printk("cpu %d pd_notif=%d check_groups %d old_busy_time=%llu prev_sum=%lld new_load=%llu old_pred=%llu new_pred=%llu",
+ __entry->cpu, __entry->pd_notif, __entry->check_groups,
+ __entry->old_busy_time, __entry->ps, __entry->new_load,
+ __entry->old_pred, __entry->new_pred)
);
#endif /* CONFIG_SCHED_FREQ_INPUT */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b55bbbd7431..87e93b3f3b4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,6 +97,9 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
"TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
"IRQ_UPDATE"};
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
+ "RQ_TO_RQ", "GROUP_TO_GROUP"};
+
ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
@@ -1864,6 +1867,61 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
static unsigned int sync_cpu;
+static LIST_HEAD(related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &related_thread_groups, list)
+
+/*
+ * Demand aggregation for frequency purposes:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purposes. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Let's say the stats for window M were as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was the most busy going by just its prev_sum counter.
+ * Demand from all group A tasks is added to CPU1. IOW, at the end of window M,
+ * the cpu busy time reported to the governor will be:
+ *
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+static __read_mostly unsigned int sched_freq_aggregate;
+__read_mostly unsigned int sysctl_sched_freq_aggregate;
+
#define EXITING_TASK_MARKER 0xdeaddead
static inline int exiting_task(struct task_struct *p)
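
To make the aggregation rule above concrete, here is a minimal stand-alone user-space sketch (not part of the patch; it models only cluster0 from the example in the comment, and reduces max_busy_cpu selection to the rq->prev_runnable_sum comparison described there) that reproduces the busy times reported for window M:

#include <stdio.h>

#define NR_CPUS 4	/* cluster0 only, as in the example above */

int main(void)
{
	/* rq->prev_runnable_sum for tasks outside group A (ms) */
	unsigned long long rq_prev[NR_CPUS] = { 1, 5, 0, 0 };
	/* A->cpu_time[cpu].prev_runnable_sum for group A tasks (ms) */
	unsigned long long grp_prev[NR_CPUS] = { 5, 6, 0, 0 };
	unsigned long long reported[NR_CPUS];
	unsigned long long grp_total = 0;
	int cpu, max_busy_cpu = 0;

	/* max_busy_cpu is chosen by rq->prev_runnable_sum alone */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (rq_prev[cpu] > rq_prev[max_busy_cpu])
			max_busy_cpu = cpu;
		grp_total += grp_prev[cpu];
	}

	/* group demand of the whole cluster lands on max_busy_cpu */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		reported[cpu] = rq_prev[cpu];
		if (cpu == max_busy_cpu)
			reported[cpu] += grp_total;
		printf("C%d busy time = %llums\n", cpu, reported[cpu]);
	}
	return 0;	/* prints C0 = 1ms, C1 = 16ms, C2 = 0ms, C3 = 0ms */
}

This matches the figures in the comment: only the max busy cpu of the frequency domain has the cluster-wide group demand added, which is also what sched_get_cpus_busy() does below when no notifier was sent.
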
@@ -1955,12 +2013,67 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load)
return freq;
}
-/* Should scheduler alert governor for changing frequency? */
-static int send_notification(struct rq *rq, int check_pred)
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu);
+
+/*
+ * Return load from all related groups on the given cpu.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+
+ for_each_related_thread_group(grp) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, cpu);
+ *grp_load += cpu_time->prev_runnable_sum;
+ if (new_grp_load)
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+}
+
+/*
+ * Return load from all related groups in the given frequency domain.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void group_load_in_freq_domain(struct cpumask *cpus,
+ u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+ int j;
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(j, cpus) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, j);
+ *grp_load += cpu_time->prev_runnable_sum;
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+ }
+}
+
+/*
+ * Should scheduler alert governor for changing frequency?
+ *
+ * @check_pred - evaluate frequency based on the predictive demand
+ * @check_groups - add load from all related groups on given cpu
+ *
+ * check_groups is set to 1 if a "related" task movement/wakeup is triggering
+ * the notification check. To avoid "re-aggregation" of demand in such cases,
+ * we check whether the migrated/woken task's demand (along with demand from
+ * existing tasks on the cpu) can be met on the target cpu.
+ *
+ */
+
+static int send_notification(struct rq *rq, int check_pred, int check_groups)
{
unsigned int cur_freq, freq_required;
unsigned long flags;
int rc = 0;
+ u64 group_load = 0, new_load;
if (!sched_enable_hmp)
return 0;
@@ -1982,8 +2095,22 @@ static int send_notification(struct rq *rq, int check_pred)
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
+ read_lock(&related_thread_group_lock);
+ /*
+ * Protect from concurrent update of rq->prev_runnable_sum and
+ * group cpu load
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (check_groups)
+ _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
+
+ new_load = rq->prev_runnable_sum + group_load;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ read_unlock(&related_thread_group_lock);
+
cur_freq = load_to_freq(rq, rq->old_busy_time);
- freq_required = load_to_freq(rq, rq->prev_runnable_sum);
+ freq_required = load_to_freq(rq, new_load);
if (nearly_same_freq(cur_freq, freq_required))
return 0;
@@ -1993,6 +2120,8 @@ static int send_notification(struct rq *rq, int check_pred)
if (!rq->notifier_sent) {
rq->notifier_sent = 1;
rc = 1;
+ trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
+ new_load);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -2000,17 +2129,13 @@ static int send_notification(struct rq *rq, int check_pred)
}
/* Alert governor if there is a need to change frequency */
-void check_for_freq_change(struct rq *rq, bool check_pred)
+void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
{
int cpu = cpu_of(rq);
- if (!send_notification(rq, check_pred))
+ if (!send_notification(rq, check_pred, check_groups))
return;
- trace_sched_freq_alert(cpu, check_pred, rq->old_busy_time,
- rq->prev_runnable_sum, rq->old_estimated_time,
- rq->hmp_stats.pred_demands_sum);
-
atomic_notifier_call_chain(
&load_alert_notifier_head, 0,
(void *)(long)cpu);
@@ -2031,11 +2156,21 @@ static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
if (event == TASK_WAKE)
return 0;
- if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
- event == TASK_UPDATE)
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
return 1;
- /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ /*
+ * TASK_UPDATE can be called on a sleeping task, when it is moved between
+ * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? sched_freq_account_wait_time : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
return sched_freq_account_wait_time;
}
@@ -2262,6 +2397,15 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
event != PICK_NEXT_TASK)))
return;
+ /*
+ * TASK_UPDATE can be called on a sleeping task, when it is moved between
+ * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !sched_freq_account_wait_time)
+ return;
+ }
+
new = calc_pred_demand(rq, p);
old = p->ravg.pred_demand;
@@ -2290,7 +2434,14 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
u64 window_start = rq->window_start;
u32 window_size = sched_ravg_window;
u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ int flip_counters = 0;
+ int prev_sum_reset = 0;
bool new_task;
+ struct related_thread_group *grp;
new_window = mark_start < window_start;
if (new_window) {
@@ -2302,6 +2453,51 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
new_task = is_new_task(p);
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ /* cpu_time protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu_of(rq));
+
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (cpu_time->window_start != rq->window_start) {
+ int nr_windows;
+
+ delta = rq->window_start - cpu_time->window_start;
+ nr_windows = div64_u64(delta, window_size);
+ if (nr_windows > 1)
+ prev_sum_reset = 1;
+
+ cpu_time->window_start = rq->window_start;
+ flip_counters = 1;
+ }
+
+ if (p_is_curr_task && new_window) {
+ u64 curr_sum = rq->curr_runnable_sum;
+ u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+
+ if (nr_full_windows)
+ curr_sum = nt_curr_sum = 0;
+
+ rq->prev_runnable_sum = curr_sum;
+ rq->nt_prev_runnable_sum = nt_curr_sum;
+
+ rq->curr_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = 0;
+ }
+ } else {
+ if (p_is_curr_task && new_window) {
+ flip_counters = 1;
+ if (nr_full_windows)
+ prev_sum_reset = 1;
+ }
+ }
+
/* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks. */
if (new_window && !is_idle_task(p) && !exiting_task(p)) {
@@ -2314,6 +2510,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
p->ravg.curr_window = 0;
}
+ if (flip_counters) {
+ u64 curr_sum = *curr_runnable_sum;
+ u64 nt_curr_sum = *nt_curr_runnable_sum;
+
+ if (prev_sum_reset)
+ curr_sum = nt_curr_sum = 0;
+
+ *prev_runnable_sum = curr_sum;
+ *nt_prev_runnable_sum = nt_curr_sum;
+
+ *curr_runnable_sum = 0;
+ *nt_curr_runnable_sum = 0;
+ }
+
if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
/* account_busy_for_cpu_time() = 0, so no update to the
* task's current window needs to be made. This could be
@@ -2331,19 +2541,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
/* A new window has started. The RQ demand must be rolled
* over if p is the current task. */
if (p_is_curr_task) {
- u64 prev_sum = 0, nt_prev_sum = 0;
-
- /* p is either idle task or an exiting task */
- if (!nr_full_windows) {
- prev_sum = rq->curr_runnable_sum;
- nt_prev_sum = rq->nt_curr_runnable_sum;
- }
-
- rq->prev_runnable_sum = prev_sum;
- rq->curr_runnable_sum = 0;
- rq->nt_prev_runnable_sum = nt_prev_sum;
- rq->nt_curr_runnable_sum = 0;
-
+ /* p is idle task */
+ BUG_ON(p != rq->idle);
} else if (heavy_task_wakeup(p, rq, event)) {
/* A new window has started. If p is a waking
* heavy task its prev_window contribution is faked
@@ -2353,9 +2552,9 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* can be controlled via the sched_heavy_task
* tunable. */
p->ravg.prev_window = p->ravg.demand;
- rq->prev_runnable_sum += p->ravg.demand;
+ *prev_runnable_sum += p->ravg.demand;
if (new_task)
- rq->nt_prev_runnable_sum += p->ravg.demand;
+ *nt_prev_runnable_sum += p->ravg.demand;
}
return;
@@ -2373,9 +2572,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
else
delta = irqtime;
delta = scale_exec_time(delta, rq, cc);
- rq->curr_runnable_sum += delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum += delta;
+ *nt_curr_runnable_sum += delta;
+
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window += delta;
@@ -2409,15 +2609,17 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (!exiting_task(p))
p->ravg.prev_window = delta;
}
- rq->prev_runnable_sum += delta;
+
+ *prev_runnable_sum += delta;
if (new_task)
- rq->nt_prev_runnable_sum += delta;
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq, cc);
- rq->curr_runnable_sum += delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum += delta;
+ *nt_curr_runnable_sum += delta;
+
if (!exiting_task(p))
p->ravg.curr_window = delta;
@@ -2444,12 +2646,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
cc);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window += delta;
-
- rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
- if (new_task)
- rq->nt_prev_runnable_sum += delta;
-
- delta += rq->curr_runnable_sum;
} else {
/* Since at least one full window has elapsed,
* the contribution to the previous window is the
@@ -2457,27 +2653,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
delta = scale_exec_time(window_size, rq, cc);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window = delta;
-
- if (new_task)
- rq->nt_prev_runnable_sum = delta;
- else
- rq->nt_prev_runnable_sum = 0;
}
- /*
- * Rollover for normal runnable sum is done here by overwriting
- * the values in prev_runnable_sum and curr_runnable_sum.
- * Rollover for new task runnable sum has completed by previous
- * if-else statement.
- */
- rq->prev_runnable_sum = delta;
+
+ /* The prev/curr_runnable_sum rollover has already been handled above
+ * (via flip_counters); here only this task's contribution is added. */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq, cc);
- rq->curr_runnable_sum = delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum = delta;
- else
- rq->nt_curr_runnable_sum = 0;
+ *nt_curr_runnable_sum += delta;
+
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window = delta;
@@ -2500,12 +2689,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
/* Roll window over. If IRQ busy time was just in the current
* window then that is all that need be accounted. */
- rq->prev_runnable_sum = rq->curr_runnable_sum;
- rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
- rq->nt_curr_runnable_sum = 0;
if (mark_start > window_start) {
- rq->curr_runnable_sum = scale_exec_time(irqtime, rq,
- cc);
+ *curr_runnable_sum = scale_exec_time(irqtime, rq, cc);
return;
}
@@ -2515,7 +2700,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (delta > window_size)
delta = window_size;
delta = scale_exec_time(delta, rq, cc);
- rq->prev_runnable_sum += delta;
+ *prev_runnable_sum += delta;
/* Process the remaining IRQ busy time in the current window. */
delta = wallclock - window_start;
@@ -2820,7 +3005,8 @@ update_task_ravg(struct task_struct *p, struct rq *rq, int event,
update_task_pred_demand(rq, p, event);
done:
trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
- cc.cycles, cc.time);
+ cc.cycles, cc.time,
+ _group_cpu_time(p->grp, cpu_of(rq)));
p->ravg.mark_start = wallclock;
@@ -3002,7 +3188,8 @@ enum reset_reason_code {
ACCOUNT_WAIT_TIME_CHANGE,
HIST_SIZE_CHANGE,
MIGRATION_FIXUP_CHANGE,
- FREQ_ACCOUNT_WAIT_TIME_CHANGE
+ FREQ_ACCOUNT_WAIT_TIME_CHANGE,
+ FREQ_AGGREGATE_CHANGE,
};
const char *sched_window_reset_reasons[] = {
@@ -3021,6 +3208,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
+ struct related_thread_group *grp;
disable_window_stats();
@@ -3028,11 +3216,26 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
local_irq_save(flags);
+ read_lock(&related_thread_group_lock);
+
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
}
+ list_for_each_entry(grp, &related_thread_groups, list) {
+ int j;
+
+ for_each_possible_cpu(j) {
+ struct group_cpu_time *cpu_time;
+ /* Protected by rq lock */
+ cpu_time = _group_cpu_time(grp, j);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ if (window_start)
+ cpu_time->window_start = window_start;
+ }
+ }
+
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
@@ -3081,6 +3284,12 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
new = sysctl_sched_freq_account_wait_time;
sched_freq_account_wait_time =
sysctl_sched_freq_account_wait_time;
+ } else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
}
#endif
@@ -3089,6 +3298,8 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
raw_spin_unlock(&rq->lock);
}
+ read_unlock(&related_thread_group_lock);
+
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
@@ -3097,13 +3308,17 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
#ifdef CONFIG_SCHED_FREQ_INPUT
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
+
void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus)
{
unsigned long flags;
struct rq *rq;
const int cpus = cpumask_weight(query_cpus);
- u64 load[cpus], nload[cpus];
+ u64 load[cpus], group_load[cpus];
+ u64 nload[cpus], ngload[cpus];
u64 pload[cpus];
unsigned int cur_freq[cpus], max_freq[cpus];
int notifier_sent[cpus];
@@ -3111,6 +3326,9 @@ void sched_get_cpus_busy(struct sched_load *busy,
int cpu, i = 0;
unsigned int window_size;
struct cpu_cycle cc;
+ u64 max_prev_sum = 0;
+ int max_busy_cpu = cpumask_first(query_cpus);
+ struct related_thread_group *grp;
if (unlikely(cpus == 0))
return;
@@ -3120,6 +3338,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
+ read_lock(&related_thread_group_lock);
+
local_irq_save(flags);
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3137,6 +3357,49 @@ void sched_get_cpus_busy(struct sched_load *busy,
nload[i] = rq->nt_prev_runnable_sum;
pload[i] = rq->hmp_stats.pred_demands_sum;
rq->old_estimated_time = pload[i];
+
+ if (load[i] > max_prev_sum) {
+ max_prev_sum = load[i];
+ max_busy_cpu = cpu;
+ }
+
+ notifier_sent[i] = rq->notifier_sent;
+ early_detection[i] = (rq->ed_task != NULL);
+ rq->notifier_sent = 0;
+ cur_freq[i] = cpu_cur_freq(cpu);
+ max_freq[i] = cpu_max_freq(cpu);
+ i++;
+ }
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(cpu, query_cpus) {
+ /* Protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu);
+ sync_window_start(cpu_rq(cpu), cpu_time);
+ }
+ }
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ group_load[i] = 0;
+ ngload[i] = 0;
+
+ if (early_detection[i])
+ goto skip_early;
+
+ rq = cpu_rq(cpu);
+ if (!notifier_sent[i]) {
+ if (cpu == max_busy_cpu)
+ group_load_in_freq_domain(
+ &rq->freq_domain_cpumask,
+ &group_load[i], &ngload[i]);
+ } else {
+ _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
+ }
+
+ load[i] += group_load[i];
+ nload[i] += ngload[i];
/*
* Scale load in reference to cluster max_possible_freq.
*
@@ -3146,11 +3409,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
load[i] = scale_load_to_cpu(load[i], cpu);
nload[i] = scale_load_to_cpu(nload[i], cpu);
pload[i] = scale_load_to_cpu(pload[i], cpu);
-
- notifier_sent[i] = rq->notifier_sent;
- early_detection[i] = (rq->ed_task != NULL);
- rq->notifier_sent = 0;
- max_freq[i] = cpu_max_freq(cpu);
+skip_early:
i++;
}
@@ -3158,6 +3417,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
raw_spin_unlock(&(cpu_rq(cpu))->lock);
local_irq_restore(flags);
+ read_unlock(&related_thread_group_lock);
+
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
@@ -3205,17 +3466,6 @@ exit_early:
}
}
-unsigned long sched_get_busy(int cpu)
-{
- struct cpumask query_cpu = CPU_MASK_NONE;
- struct sched_load busy;
-
- cpumask_set_cpu(cpu, &query_cpu);
- sched_get_cpus_busy(&busy, &query_cpu);
-
- return busy.prev_load;
-}
-
void sched_set_io_is_busy(int val)
{
sched_io_is_busy = val;
@@ -3267,7 +3517,14 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ struct migration_sum_data d;
bool new_task;
+ struct related_thread_group *grp;
if (!sched_enable_hmp || !sched_migration_fixup ||
(!p->on_rq && p->state != TASK_WAKING))
@@ -3298,22 +3555,62 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
update_task_cpu_cycles(p, new_cpu);
new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ migrate_type = GROUP_TO_GROUP;
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ sync_window_start(dest_rq, cpu_time);
+ } else {
+ migrate_type = RQ_TO_RQ;
+ d.src_rq = src_rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = dest_rq;
+ d.dst_cpu_time = NULL;
+ src_curr_runnable_sum = &src_rq->curr_runnable_sum;
+ src_prev_runnable_sum = &src_rq->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
+
+ dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
+ dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
+ }
if (p->ravg.curr_window) {
- src_rq->curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
- src_rq->nt_curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
- src_rq->prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
- src_rq->nt_prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
}
@@ -3323,13 +3620,11 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
dest_rq->ed_task = p;
}
- BUG_ON((s64)src_rq->prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->curr_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
-
- trace_sched_migration_update_sum(src_rq, p);
- trace_sched_migration_update_sum(dest_rq, p);
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
done:
if (p->state == TASK_WAKING)
@@ -3368,10 +3663,6 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
update_up_down_migrate();
}
-static LIST_HEAD(related_thread_groups);
-static DEFINE_RWLOCK(related_thread_group_lock);
-static int nr_related_thread_groups;
-
/* Return cluster which can offer required capacity for group */
static struct sched_cluster *
best_cluster(struct related_thread_group *grp, u64 total_demand)
@@ -3421,6 +3712,199 @@ static void set_preferred_cluster(struct related_thread_group *grp)
raw_spin_unlock(&grp->lock);
}
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+static struct cpu_cycle
+update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime);
+
+static inline void free_group_cputime(struct related_thread_group *grp)
+{
+ free_percpu(grp->cpu_time);
+}
+
+static int alloc_group_cputime(struct related_thread_group *grp)
+{
+ int i;
+ struct group_cpu_time *cpu_time;
+ int cpu = raw_smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ u64 window_start = rq->window_start;
+
+ grp->cpu_time = alloc_percpu(struct group_cpu_time);
+ if (!grp->cpu_time)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ cpu_time = per_cpu_ptr(grp->cpu_time, i);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ cpu_time->window_start = window_start;
+ }
+
+ return 0;
+}
+
+/*
+ * A group's window_start may be behind. When moving it forward, flip prev/curr
+ * counters. When moving forward by more than one window, the prev counter is set to 0.
+ */
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
+{
+ u64 delta;
+ int nr_windows;
+ u64 curr_sum = cpu_time->curr_runnable_sum;
+ u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
+
+ delta = rq->window_start - cpu_time->window_start;
+ if (!delta)
+ return;
+
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ if (nr_windows > 1)
+ curr_sum = nt_curr_sum = 0;
+
+ cpu_time->prev_runnable_sum = curr_sum;
+ cpu_time->curr_runnable_sum = 0;
+
+ cpu_time->nt_prev_runnable_sum = nt_curr_sum;
+ cpu_time->nt_curr_runnable_sum = 0;
+
+ cpu_time->window_start = rq->window_start;
+}
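
As a quick illustration of the rollover rule implemented above, the following user-space model (not kernel code: the struct is reduced to the three fields used here, the nt_* counters are omitted, and the window length is an arbitrary 20 units) shows the two cases:

#include <assert.h>

struct gct { unsigned long long ws, curr, prev; };	/* reduced group_cpu_time */

/* mirror of the rule above: flip on a one-window lag, zero prev on a larger lag */
static void sync_ws(struct gct *t, unsigned long long rq_ws,
		    unsigned long long window)
{
	unsigned long long delta = rq_ws - t->ws;

	if (!delta)
		return;
	t->prev = (delta / window > 1) ? 0 : t->curr;
	t->curr = 0;
	t->ws = rq_ws;
}

int main(void)
{
	struct gct a = { .ws = 100, .curr = 7, .prev = 3 };

	sync_ws(&a, 120, 20);	/* exactly one window behind: prev takes old curr */
	assert(a.prev == 7 && a.curr == 0 && a.ws == 120);

	sync_ws(&a, 180, 20);	/* three windows behind: prev is zeroed */
	assert(a.prev == 0 && a.curr == 0 && a.ws == 180);
	return 0;
}
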
+
+/*
+ * Task's cpu usage is accounted in:
+ * rq->curr/prev_runnable_sum, when its ->grp is NULL
+ * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer task's cpu usage between those counters when transitioning between
+ * groups
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event)
+{
+ u64 wallclock;
+ struct group_cpu_time *cpu_time;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ struct migration_sum_data d;
+ int migrate_type;
+
+ if (!sched_freq_aggregate)
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+
+ /* cpu_time protected by related_thread_group_lock, grp->lock and rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(rq));
+ if (event == ADD_TASK) {
+ sync_window_start(rq, cpu_time);
+ migrate_type = RQ_TO_GROUP;
+ d.src_rq = rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ src_curr_runnable_sum = &rq->curr_runnable_sum;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &rq->prev_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ } else if (event == REM_TASK) {
+ migrate_type = GROUP_TO_RQ;
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ d.dst_rq = rq;
+ d.dst_cpu_time = NULL;
+
+ /*
+ * In case of REM_TASK, cpu_time->window_start would be
+ * up to date, because of the update_task_ravg() we called
+ * above on the moving task. Hence there is no need for
+ * sync_window_start().
+ */
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_curr_runnable_sum = &rq->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ }
+
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+
+ if (is_new_task(p)) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+}
+
+static inline struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+ return _group_cpu_time(rcu_dereference(p->grp), cpu);
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+ return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
+}
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+static inline void free_group_cputime(struct related_thread_group *grp) { }
+
+static inline int alloc_group_cputime(struct related_thread_group *grp)
+{
+ return 0;
+}
+
+static inline void transfer_busy_time(struct rq *rq,
+ struct related_thread_group *grp, struct task_struct *p, int event)
+{
+}
+
+static struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+ return NULL;
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+ return NULL;
+}
+
+#endif
+
struct related_thread_group *alloc_related_thread_group(int group_id)
{
struct related_thread_group *grp;
@@ -3429,6 +3913,11 @@ struct related_thread_group *alloc_related_thread_group(int group_id)
if (!grp)
return ERR_PTR(-ENOMEM);
+ if (alloc_group_cputime(grp)) {
+ kfree(grp);
+ return ERR_PTR(-ENOMEM);
+ }
+
grp->id = group_id;
INIT_LIST_HEAD(&grp->tasks);
INIT_LIST_HEAD(&grp->list);
@@ -3449,6 +3938,16 @@ struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
return NULL;
}
+/* See comments before preferred_cluster() */
+static void free_related_thread_group(struct rcu_head *rcu)
+{
+ struct related_thread_group *grp = container_of(rcu, struct
+ related_thread_group, rcu);
+
+ free_group_cputime(grp);
+ kfree(grp);
+}
+
static void remove_task_from_group(struct task_struct *p)
{
struct related_thread_group *grp = p->grp;
@@ -3458,6 +3957,7 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_lock(&grp->lock);
rq = __task_rq_lock(p);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
list_del_init(&p->grp_list);
rcu_assign_pointer(p->grp, NULL);
__task_rq_unlock(rq);
@@ -3471,9 +3971,7 @@ static void remove_task_from_group(struct task_struct *p)
if (empty_group) {
list_del(&grp->list);
- nr_related_thread_groups--;
- /* See comments before preferred_cluster() */
- kfree_rcu(grp, rcu);
+ call_rcu(&grp->rcu, free_related_thread_group);
}
}
@@ -3489,8 +3987,9 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
* reference of p->grp in various hot-paths
*/
rq = __task_rq_lock(p);
- rcu_assign_pointer(p->grp, grp);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
__task_rq_unlock(rq);
_set_preferred_cluster(grp);
@@ -3539,7 +4038,6 @@ redo:
} else if (!grp && new) {
/* New group - use object allocated before */
destroy = 0;
- nr_related_thread_groups++;
list_add(&new->list, &related_thread_groups);
grp = new;
}
@@ -3550,8 +4048,10 @@ redo:
done:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- if (destroy)
+ if (new && destroy) {
+ free_group_cputime(new);
kfree(new);
+ }
return rc;
}
@@ -3898,13 +4398,19 @@ static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
struct task_struct *p)
{
struct migration_notify_data mnd;
+ bool check_groups;
+
+ rcu_read_lock();
+ check_groups = rcu_access_pointer(p->grp) != NULL;
+ rcu_read_unlock();
if (!same_freq_domain(src_cpu, dest_cpu)) {
if (!src_cpu_dead)
- check_for_freq_change(cpu_rq(src_cpu), false);
- check_for_freq_change(cpu_rq(dest_cpu), false);
+ check_for_freq_change(cpu_rq(src_cpu), false,
+ check_groups);
+ check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
} else {
- check_for_freq_change(cpu_rq(dest_cpu), true);
+ check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
}
if (task_notify_on_migrate(p)) {
@@ -4771,6 +5277,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
struct related_thread_group *grp = NULL;
#endif
bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER);
+ bool check_group = false;
wake_flags &= ~WF_NO_NOTIFIER;
@@ -4846,6 +5353,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (update_preferred_cluster(grp, p, old_load))
set_preferred_cluster(grp);
rcu_read_unlock();
+ check_group = grp != NULL;
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -4894,12 +5402,14 @@ out:
if (freq_notif_allowed) {
if (!same_freq_domain(src_cpu, cpu)) {
- check_for_freq_change(cpu_rq(cpu), false);
- check_for_freq_change(cpu_rq(src_cpu), false);
+ check_for_freq_change(cpu_rq(cpu),
+ false, check_group);
+ check_for_freq_change(cpu_rq(src_cpu),
+ false, check_group);
} else if (heavy_task) {
- check_for_freq_change(cpu_rq(cpu), false);
+ check_for_freq_change(cpu_rq(cpu), false, false);
} else if (success) {
- check_for_freq_change(cpu_rq(cpu), true);
+ check_for_freq_change(cpu_rq(cpu), true, false);
}
}
@@ -10543,6 +11053,7 @@ void __init sched_init(void)
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
rq->old_busy_time = 0;
rq->old_estimated_time = 0;
+ rq->old_busy_time_group = 0;
rq->notifier_sent = 0;
rq->hmp_stats.pred_demands_sum = 0;
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0288a331e311..a33eddb7b17d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,9 +32,8 @@
#include <linux/task_work.h>
#include <linux/ratelimit.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -4059,6 +4058,9 @@ static inline int invalid_value_freq_input(unsigned int *data)
if (data == &sysctl_sched_freq_account_wait_time)
return !(*data == 0 || *data == 1);
+ if (data == &sysctl_sched_freq_aggregate)
+ return !(*data == 0 || *data == 1);
+
return 0;
}
#else
@@ -7674,6 +7676,7 @@ enum fbq_type { regular, remote, all };
LBF_BIG_TASK_ACTIVE_BALANCE)
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
@@ -7916,6 +7919,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
deactivate_task(env->src_rq, p, 0);
double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (rcu_access_pointer(p->grp))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
double_unlock_balance(env->src_rq, env->dst_rq);
}
@@ -9575,10 +9580,13 @@ no_move:
/* Assumes one 'busiest' cpu that we pulled tasks from */
if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
- check_for_freq_change(this_rq, false);
- check_for_freq_change(busiest, false);
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
} else {
- check_for_freq_change(this_rq, true);
+ check_for_freq_change(this_rq, true, false);
}
}
if (likely(!active_balance)) {
@@ -9876,10 +9884,12 @@ out_unlock:
local_irq_enable();
if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
- check_for_freq_change(busiest_rq, false);
- check_for_freq_change(target_rq, false);
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
} else if (moved) {
- check_for_freq_change(target_rq, true);
+ check_for_freq_change(target_rq, true, false);
}
if (per_cpu(dbs_boost_needed, target_cpu)) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a66d8a12051c..df9b972195e5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,16 @@ struct related_thread_group {
struct sched_cluster *preferred_cluster;
struct rcu_head rcu;
u64 last_update;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time __percpu *cpu_time; /* one per cluster */
+#endif
+};
+
+struct migration_sum_data {
+ struct rq *src_rq, *dst_rq;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time *src_cpu_time, *dst_cpu_time;
+#endif
};
extern struct list_head cluster_head;
@@ -741,7 +751,7 @@ struct rq {
struct task_struct *ed_task;
#ifdef CONFIG_SCHED_FREQ_INPUT
- unsigned int old_busy_time;
+ u64 old_busy_time, old_busy_time_group;
int notifier_sent;
u64 old_estimated_time;
#endif
@@ -1337,7 +1347,16 @@ static inline int update_preferred_cluster(struct related_thread_group *grp,
#ifdef CONFIG_SCHED_FREQ_INPUT
#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
-extern void check_for_freq_change(struct rq *rq, bool check_cra);
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+
+struct group_cpu_time {
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 window_start;
+};
/* Is frequency of two cpus synchronized with each other? */
static inline int same_freq_domain(int src_cpu, int dst_cpu)
@@ -1355,7 +1374,8 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu)
#define sched_migration_fixup 0
#define PRED_DEMAND_DELTA (0)
-static inline void check_for_freq_change(struct rq *rq, bool check_cra) { }
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
index cdb1d7c53849..c70e0466c36c 100644
--- a/kernel/sched/sched_avg.c
+++ b/kernel/sched/sched_avg.c
@@ -18,9 +18,9 @@
#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/math64.h>
-#include <trace/events/sched.h>
#include "sched.h"
+#include <trace/events/sched.h>
static DEFINE_PER_CPU(u64, nr_prod_sum);
static DEFINE_PER_CPU(u64, last_time);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1da3b96368b1..825be75ca1a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -472,6 +472,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "sched_freq_aggregate",
+ .data = &sysctl_sched_freq_aggregate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
#endif
{
.procname = "sched_boost",