summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinux Build Service Account <lnxbuild@localhost>2016-05-31 12:11:36 -0600
committerLinux Build Service Account <lnxbuild@localhost>2016-05-31 12:11:36 -0600
commit3ebf81bef3da6dd1f12f70773866e0063ac3f368 (patch)
tree20f20f9902f9df8344fa007a78f64d8e5668dcea /kernel
parent89c198ac7fb60408d3f786ef2bcfaab2a56d4cbe (diff)
parentcd9403e96df5ef8eaefabdb67da22f0a9887964c (diff)
Promotion of kernel.lnx.4.4-160526.
CRs Change ID Subject -------------------------------------------------------------------------------------------------------------- 972519 I35151c460b4350ebd414b67c655684c2019f799f trace: prevent NULL pointer dereference 1013947 I7df9aeb55a95185077c679a217ed8772eb83c8b9 arm64: defconfig: update config options for msm-perf_def 1021612 I312444176373f73f02aa0ceddf5e114a39702641 ARM: msm: dts: fix register dump offsets/ranges for msmc 1019272 I38c637936b398f2fb1665c8233ed5e49e83bf296 thermal: qpnp-temp-alarm: update thermal callback parame 1020529 Ia4f54bfee8111f9f039f772a8bcc7c9a0400d5aa edac: cortex: Update the error strings to reflect Kryo2x 1005061 I4ed9f1c6ad089f80dcd19762fda151ce1572f471 msm: ipa3: WA for incorrect state retention for GSI chan 1019256 I87cca1215134e6d406f60d54f6d0430978eeae9c icnss: Add API to return the CE IRQ number 1015545 I5aad7032f3f8048216a41765f1cf91fde98f6ade msm: ipa3: fix odu debugfs 1006937 Ib8cb979136def6696861a7835bcde763dabe874f net: Warn for cloned packets in ingress path on SMP syst 1008023 Id9949bef91835318a7b344753983eea0aeab7bdc msm: ipa3: add support for TX of sk_buff's with paged da 1021612 I4fbc9aa1f30d36d35a9ad181185761e697cbbef7 msm: mdss: Fix qseed3 clk calculation overflow 1019188 Ib52e6551ac67215dab2bc5770ddcf037568f8b77 net: rmnet_data: Fix use after free when sending MAP com 989851 Ifa42fbd475665a0ca581c907ce5432584ea0e7ed msm: mdss: fix possible out-of-bounds and overflow issue 1016956 I906005680b4cc90cc38dc3d403beebf7aa515ad7 usb: dwc3: Add support handle type-c plug orientation 1019798 I7bccd68866457bb0635ae5166ec935f9e82ba760 soc: qcom: print raw data for gladiator error registers 1017182 I32f312f11fcbebbff0799120448d6e8f0d9ec98d ARM: dts: msm: Add v4l2 rotator node to msm8996 1020265 Id19733e6e075a427c4aa745b5bedc93f29a2dd4f ARM: dts: msm: Add nfc device to msmcobalt 988990 I19aa5983316bec4a87811c8aa8b54f770001c45f msm: mdss: Adding support for destination scaler 993024 I32b0e57c8e958b7e5f1d647e37e46fda052b3d1e 
ARM: dts: msm: Support partial goods for msmcobalt 1013948 If024f55095a951329976b6c2736ad5760eae1f4f arm64: defconfig: update config options for msmcortex-pe 1020515 I2c1fb7dcc698142f9ce42f40164521b8a78268e1 defconfig: msm: Remove incorrect ISPIF version 1013147 Iae6804bcb3121e0852ec5d14d0939623b97a6e67 qcom-charger: Don't automatically set USB_PD type when P 988990 I9a4b9701e078fa39783f33f023eef2da75c1c162 ARM: dts: msm: add extra destination scaler properties f 1020505 I87d18778fef81671c5e7cc261cc70ce07c662933 regulator: cpr3-regulator: support corner bands with no 1019888 I1a8241c1e0a349394351be2ef98381e24f0c4ff4 defconfig: msmcortex: enable qrng drivers 1003367 I75089e210a6fc72683dcf98cdd4da9d6ab3e6fcf msm: kgsl: Correction of VBIF debug bus data in snapshot 1005061 Id849055526bf70e0cc8161239b4530a7fc575744 ARM: dts: msm: enable WA for IPA channel 20 for msmcobal 1002974 Ic0dedbadc0dd2125bd2a7bcc152972c0555e07f8 msm: kgsl: Defer adding the mem entry to a process 1017182 I6fc5f90512d8024439d56d7c72ae2160df460f7a defconfig: msm: sde: Add config for v4l2 sde rotator dri 1006067 I6add3800c40cd09f6e6e0cf2720e69059bd83cbc msm: kgsl: Avoid race condition in ioctl_syncsource_dest 1013147 I77c5875ee8514395a82fac0109b7cff1d507250b usb: pd: Update power_supply type to USB_PD after PD is 1021612 I62a3bd31997be05181de98307089e2a69d98ab7b msm: mdss: fix amortized prefill calculations 1019888 I2c808713aaac42345b97665a8990f5bbb9b9145e ARM: dts: Add qrng driver support for msmcobalt 1013913 I9a17c83d6613ff37cede4a7bb52612465e4d0101 regulator: labibb: Fix slew rate calculation in LAB/IBB 1016956 Idd236136c9f0a9163b4ae7a8405c412f1d69ca9e usb: pd: Add support to notify plug orientation via extc 1016956 I893c0b729015cd22791d168453309168246961e2 usb: phy: qmp: Configure phy lane based on plug orientat 972998 I6a99fa6961e9205d7d9ccb470873c26adde8a91f ARM: dts: msm: Change csi clock voting from ispif node 1020505 I6b9d663b44c96dafba26ad25bcfc4b61c8c86d56 regulator: cpr3-regulator: 
support step quot for CPRh co 977896 I71e6047620066323721c6d542034ddd4b2950e7f sched: Aggregate for frequency 992942 Iaf90ab4c1d17f903d03458d76cab1b4c0a5c8836 msm: camera: isp: Fix warning and errors based on static 1013787 Ieb0a7aa1b1b5f23220854092dcc2119d29c57146 msm: camera: sensor: Add support for 3B read 1017182 If634894768b02d124ceab071a9eca1c36f258600 msm: mdss: Export rotator interrupt and share rotator sm Change-Id: I15d2c47b635d84cffdac17adffff8274b6f8e3f4 CRs-Fixed: 1005061, 993024, 989851, 1017182, 1020505, 1021612, 1020529, 1019256, 1003367, 1006937, 1019798, 1016956, 1019272, 1013913, 972519, 1019888, 1013787, 1006067, 1015545, 1019188, 1020515, 1013147, 977896, 1008023, 1002974, 988990, 1013947, 992942, 972998, 1013948, 1020265
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/core.c735
-rw-r--r--kernel/sched/fair.c26
-rw-r--r--kernel/sched/sched.h26
-rw-r--r--kernel/sched/sched_avg.c2
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/trace/trace_event_perf.c3
6 files changed, 674 insertions, 125 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b55bbbd7431..87e93b3f3b4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,6 +97,9 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
"TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
"IRQ_UPDATE"};
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
+ "RQ_TO_RQ", "GROUP_TO_GROUP"};
+
ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
@@ -1864,6 +1867,61 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
static unsigned int sync_cpu;
+static LIST_HEAD(related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &related_thread_groups, list)
+
+/*
+ * Demand aggregation for frequency purpose:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purpose. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Lets say the stats for window M was as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
+ * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy
+ * time reported to governor will be:
+ *
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+static __read_mostly unsigned int sched_freq_aggregate;
+__read_mostly unsigned int sysctl_sched_freq_aggregate;
+
#define EXITING_TASK_MARKER 0xdeaddead
static inline int exiting_task(struct task_struct *p)
@@ -1955,12 +2013,67 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load)
return freq;
}
-/* Should scheduler alert governor for changing frequency? */
-static int send_notification(struct rq *rq, int check_pred)
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu);
+
+/*
+ * Return load from all related group in given cpu.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+
+ for_each_related_thread_group(grp) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, cpu);
+ *grp_load += cpu_time->prev_runnable_sum;
+ if (new_grp_load)
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+}
+
+/*
+ * Return load from all related groups in given frequency domain.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void group_load_in_freq_domain(struct cpumask *cpus,
+ u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+ int j;
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(j, cpus) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, j);
+ *grp_load += cpu_time->prev_runnable_sum;
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+ }
+}
+
+/*
+ * Should scheduler alert governor for changing frequency?
+ *
+ * @check_pred - evaluate frequency based on the predictive demand
+ * @check_groups - add load from all related groups on given cpu
+ *
+ * check_groups is set to 1 if a "related" task movement/wakeup is triggering
+ * the notification check. To avoid "re-aggregation" of demand in such cases,
+ * we check whether the migrated/woken tasks demand (along with demand from
+ * existing tasks on the cpu) can be met on target cpu
+ *
+ */
+
+static int send_notification(struct rq *rq, int check_pred, int check_groups)
{
unsigned int cur_freq, freq_required;
unsigned long flags;
int rc = 0;
+ u64 group_load = 0, new_load;
if (!sched_enable_hmp)
return 0;
@@ -1982,8 +2095,22 @@ static int send_notification(struct rq *rq, int check_pred)
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
+ read_lock(&related_thread_group_lock);
+ /*
+ * Protect from concurrent update of rq->prev_runnable_sum and
+ * group cpu load
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (check_groups)
+ _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
+
+ new_load = rq->prev_runnable_sum + group_load;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ read_unlock(&related_thread_group_lock);
+
cur_freq = load_to_freq(rq, rq->old_busy_time);
- freq_required = load_to_freq(rq, rq->prev_runnable_sum);
+ freq_required = load_to_freq(rq, new_load);
if (nearly_same_freq(cur_freq, freq_required))
return 0;
@@ -1993,6 +2120,8 @@ static int send_notification(struct rq *rq, int check_pred)
if (!rq->notifier_sent) {
rq->notifier_sent = 1;
rc = 1;
+ trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
+ new_load);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -2000,17 +2129,13 @@ static int send_notification(struct rq *rq, int check_pred)
}
/* Alert governor if there is a need to change frequency */
-void check_for_freq_change(struct rq *rq, bool check_pred)
+void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
{
int cpu = cpu_of(rq);
- if (!send_notification(rq, check_pred))
+ if (!send_notification(rq, check_pred, check_groups))
return;
- trace_sched_freq_alert(cpu, check_pred, rq->old_busy_time,
- rq->prev_runnable_sum, rq->old_estimated_time,
- rq->hmp_stats.pred_demands_sum);
-
atomic_notifier_call_chain(
&load_alert_notifier_head, 0,
(void *)(long)cpu);
@@ -2031,11 +2156,21 @@ static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
if (event == TASK_WAKE)
return 0;
- if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
- event == TASK_UPDATE)
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
return 1;
- /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ /*
+ * TASK_UPDATE can be called on sleeping task, when its moved between
+ * related groups
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? sched_freq_account_wait_time : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
return sched_freq_account_wait_time;
}
@@ -2262,6 +2397,15 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
event != PICK_NEXT_TASK)))
return;
+ /*
+ * TASK_UPDATE can be called on sleeping task, when its moved between
+ * related groups
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !sched_freq_account_wait_time)
+ return;
+ }
+
new = calc_pred_demand(rq, p);
old = p->ravg.pred_demand;
@@ -2290,7 +2434,14 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
u64 window_start = rq->window_start;
u32 window_size = sched_ravg_window;
u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ int flip_counters = 0;
+ int prev_sum_reset = 0;
bool new_task;
+ struct related_thread_group *grp;
new_window = mark_start < window_start;
if (new_window) {
@@ -2302,6 +2453,51 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
new_task = is_new_task(p);
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ /* cpu_time protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu_of(rq));
+
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (cpu_time->window_start != rq->window_start) {
+ int nr_windows;
+
+ delta = rq->window_start - cpu_time->window_start;
+ nr_windows = div64_u64(delta, window_size);
+ if (nr_windows > 1)
+ prev_sum_reset = 1;
+
+ cpu_time->window_start = rq->window_start;
+ flip_counters = 1;
+ }
+
+ if (p_is_curr_task && new_window) {
+ u64 curr_sum = rq->curr_runnable_sum;
+ u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+
+ if (nr_full_windows)
+ curr_sum = nt_curr_sum = 0;
+
+ rq->prev_runnable_sum = curr_sum;
+ rq->nt_prev_runnable_sum = nt_curr_sum;
+
+ rq->curr_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = 0;
+ }
+ } else {
+ if (p_is_curr_task && new_window) {
+ flip_counters = 1;
+ if (nr_full_windows)
+ prev_sum_reset = 1;
+ }
+ }
+
/* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks. */
if (new_window && !is_idle_task(p) && !exiting_task(p)) {
@@ -2314,6 +2510,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
p->ravg.curr_window = 0;
}
+ if (flip_counters) {
+ u64 curr_sum = *curr_runnable_sum;
+ u64 nt_curr_sum = *nt_curr_runnable_sum;
+
+ if (prev_sum_reset)
+ curr_sum = nt_curr_sum = 0;
+
+ *prev_runnable_sum = curr_sum;
+ *nt_prev_runnable_sum = nt_curr_sum;
+
+ *curr_runnable_sum = 0;
+ *nt_curr_runnable_sum = 0;
+ }
+
if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
/* account_busy_for_cpu_time() = 0, so no update to the
* task's current window needs to be made. This could be
@@ -2331,19 +2541,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
/* A new window has started. The RQ demand must be rolled
* over if p is the current task. */
if (p_is_curr_task) {
- u64 prev_sum = 0, nt_prev_sum = 0;
-
- /* p is either idle task or an exiting task */
- if (!nr_full_windows) {
- prev_sum = rq->curr_runnable_sum;
- nt_prev_sum = rq->nt_curr_runnable_sum;
- }
-
- rq->prev_runnable_sum = prev_sum;
- rq->curr_runnable_sum = 0;
- rq->nt_prev_runnable_sum = nt_prev_sum;
- rq->nt_curr_runnable_sum = 0;
-
+ /* p is idle task */
+ BUG_ON(p != rq->idle);
} else if (heavy_task_wakeup(p, rq, event)) {
/* A new window has started. If p is a waking
* heavy task its prev_window contribution is faked
@@ -2353,9 +2552,9 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* can be controlled via the sched_heavy_task
* tunable. */
p->ravg.prev_window = p->ravg.demand;
- rq->prev_runnable_sum += p->ravg.demand;
+ *prev_runnable_sum += p->ravg.demand;
if (new_task)
- rq->nt_prev_runnable_sum += p->ravg.demand;
+ *nt_prev_runnable_sum += p->ravg.demand;
}
return;
@@ -2373,9 +2572,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
else
delta = irqtime;
delta = scale_exec_time(delta, rq, cc);
- rq->curr_runnable_sum += delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum += delta;
+ *nt_curr_runnable_sum += delta;
+
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window += delta;
@@ -2409,15 +2609,17 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (!exiting_task(p))
p->ravg.prev_window = delta;
}
- rq->prev_runnable_sum += delta;
+
+ *prev_runnable_sum += delta;
if (new_task)
- rq->nt_prev_runnable_sum += delta;
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq, cc);
- rq->curr_runnable_sum += delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum += delta;
+ *nt_curr_runnable_sum += delta;
+
if (!exiting_task(p))
p->ravg.curr_window = delta;
@@ -2444,12 +2646,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
cc);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window += delta;
-
- rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
- if (new_task)
- rq->nt_prev_runnable_sum += delta;
-
- delta += rq->curr_runnable_sum;
} else {
/* Since at least one full window has elapsed,
* the contribution to the previous window is the
@@ -2457,27 +2653,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
delta = scale_exec_time(window_size, rq, cc);
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.prev_window = delta;
-
- if (new_task)
- rq->nt_prev_runnable_sum = delta;
- else
- rq->nt_prev_runnable_sum = 0;
}
- /*
- * Rollover for normal runnable sum is done here by overwriting
- * the values in prev_runnable_sum and curr_runnable_sum.
- * Rollover for new task runnable sum has completed by previous
- * if-else statement.
- */
- rq->prev_runnable_sum = delta;
+
+ /* Rollover is done here by overwriting the values in
+ * prev_runnable_sum and curr_runnable_sum. */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq, cc);
- rq->curr_runnable_sum = delta;
+ *curr_runnable_sum += delta;
if (new_task)
- rq->nt_curr_runnable_sum = delta;
- else
- rq->nt_curr_runnable_sum = 0;
+ *nt_curr_runnable_sum += delta;
+
if (!is_idle_task(p) && !exiting_task(p))
p->ravg.curr_window = delta;
@@ -2500,12 +2689,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
/* Roll window over. If IRQ busy time was just in the current
* window then that is all that need be accounted. */
- rq->prev_runnable_sum = rq->curr_runnable_sum;
- rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
- rq->nt_curr_runnable_sum = 0;
if (mark_start > window_start) {
- rq->curr_runnable_sum = scale_exec_time(irqtime, rq,
- cc);
+ *curr_runnable_sum = scale_exec_time(irqtime, rq, cc);
return;
}
@@ -2515,7 +2700,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
if (delta > window_size)
delta = window_size;
delta = scale_exec_time(delta, rq, cc);
- rq->prev_runnable_sum += delta;
+ *prev_runnable_sum += delta;
/* Process the remaining IRQ busy time in the current window. */
delta = wallclock - window_start;
@@ -2820,7 +3005,8 @@ update_task_ravg(struct task_struct *p, struct rq *rq, int event,
update_task_pred_demand(rq, p, event);
done:
trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
- cc.cycles, cc.time);
+ cc.cycles, cc.time,
+ _group_cpu_time(p->grp, cpu_of(rq)));
p->ravg.mark_start = wallclock;
@@ -3002,7 +3188,8 @@ enum reset_reason_code {
ACCOUNT_WAIT_TIME_CHANGE,
HIST_SIZE_CHANGE,
MIGRATION_FIXUP_CHANGE,
- FREQ_ACCOUNT_WAIT_TIME_CHANGE
+ FREQ_ACCOUNT_WAIT_TIME_CHANGE,
+ FREQ_AGGREGATE_CHANGE,
};
const char *sched_window_reset_reasons[] = {
@@ -3021,6 +3208,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
+ struct related_thread_group *grp;
disable_window_stats();
@@ -3028,11 +3216,26 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
local_irq_save(flags);
+ read_lock(&related_thread_group_lock);
+
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
}
+ list_for_each_entry(grp, &related_thread_groups, list) {
+ int j;
+
+ for_each_possible_cpu(j) {
+ struct group_cpu_time *cpu_time;
+ /* Protected by rq lock */
+ cpu_time = _group_cpu_time(grp, j);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ if (window_start)
+ cpu_time->window_start = window_start;
+ }
+ }
+
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
@@ -3081,6 +3284,12 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
new = sysctl_sched_freq_account_wait_time;
sched_freq_account_wait_time =
sysctl_sched_freq_account_wait_time;
+ } else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
}
#endif
@@ -3089,6 +3298,8 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
raw_spin_unlock(&rq->lock);
}
+ read_unlock(&related_thread_group_lock);
+
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
@@ -3097,13 +3308,17 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
#ifdef CONFIG_SCHED_FREQ_INPUT
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
+
void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus)
{
unsigned long flags;
struct rq *rq;
const int cpus = cpumask_weight(query_cpus);
- u64 load[cpus], nload[cpus];
+ u64 load[cpus], group_load[cpus];
+ u64 nload[cpus], ngload[cpus];
u64 pload[cpus];
unsigned int cur_freq[cpus], max_freq[cpus];
int notifier_sent[cpus];
@@ -3111,6 +3326,9 @@ void sched_get_cpus_busy(struct sched_load *busy,
int cpu, i = 0;
unsigned int window_size;
struct cpu_cycle cc;
+ u64 max_prev_sum = 0;
+ int max_busy_cpu = cpumask_first(query_cpus);
+ struct related_thread_group *grp;
if (unlikely(cpus == 0))
return;
@@ -3120,6 +3338,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
+ read_lock(&related_thread_group_lock);
+
local_irq_save(flags);
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3137,6 +3357,49 @@ void sched_get_cpus_busy(struct sched_load *busy,
nload[i] = rq->nt_prev_runnable_sum;
pload[i] = rq->hmp_stats.pred_demands_sum;
rq->old_estimated_time = pload[i];
+
+ if (load[i] > max_prev_sum) {
+ max_prev_sum = load[i];
+ max_busy_cpu = cpu;
+ }
+
+ notifier_sent[i] = rq->notifier_sent;
+ early_detection[i] = (rq->ed_task != NULL);
+ rq->notifier_sent = 0;
+ cur_freq[i] = cpu_cur_freq(cpu);
+ max_freq[i] = cpu_max_freq(cpu);
+ i++;
+ }
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(cpu, query_cpus) {
+ /* Protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu);
+ sync_window_start(cpu_rq(cpu), cpu_time);
+ }
+ }
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ group_load[i] = 0;
+ ngload[i] = 0;
+
+ if (early_detection[i])
+ goto skip_early;
+
+ rq = cpu_rq(cpu);
+ if (!notifier_sent[i]) {
+ if (cpu == max_busy_cpu)
+ group_load_in_freq_domain(
+ &rq->freq_domain_cpumask,
+ &group_load[i], &ngload[i]);
+ } else {
+ _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
+ }
+
+ load[i] += group_load[i];
+ nload[i] += ngload[i];
/*
* Scale load in reference to cluster max_possible_freq.
*
@@ -3146,11 +3409,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
load[i] = scale_load_to_cpu(load[i], cpu);
nload[i] = scale_load_to_cpu(nload[i], cpu);
pload[i] = scale_load_to_cpu(pload[i], cpu);
-
- notifier_sent[i] = rq->notifier_sent;
- early_detection[i] = (rq->ed_task != NULL);
- rq->notifier_sent = 0;
- max_freq[i] = cpu_max_freq(cpu);
+skip_early:
i++;
}
@@ -3158,6 +3417,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
raw_spin_unlock(&(cpu_rq(cpu))->lock);
local_irq_restore(flags);
+ read_unlock(&related_thread_group_lock);
+
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
@@ -3205,17 +3466,6 @@ exit_early:
}
}
-unsigned long sched_get_busy(int cpu)
-{
- struct cpumask query_cpu = CPU_MASK_NONE;
- struct sched_load busy;
-
- cpumask_set_cpu(cpu, &query_cpu);
- sched_get_cpus_busy(&busy, &query_cpu);
-
- return busy.prev_load;
-}
-
void sched_set_io_is_busy(int val)
{
sched_io_is_busy = val;
@@ -3267,7 +3517,14 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ struct migration_sum_data d;
bool new_task;
+ struct related_thread_group *grp;
if (!sched_enable_hmp || !sched_migration_fixup ||
(!p->on_rq && p->state != TASK_WAKING))
@@ -3298,22 +3555,62 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
update_task_cpu_cycles(p, new_cpu);
new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ migrate_type = GROUP_TO_GROUP;
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ sync_window_start(dest_rq, cpu_time);
+ } else {
+ migrate_type = RQ_TO_RQ;
+ d.src_rq = src_rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = dest_rq;
+ d.dst_cpu_time = NULL;
+ src_curr_runnable_sum = &src_rq->curr_runnable_sum;
+ src_prev_runnable_sum = &src_rq->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
+
+ dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
+ dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
+ }
if (p->ravg.curr_window) {
- src_rq->curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
- src_rq->nt_curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
- src_rq->prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
- src_rq->nt_prev_runnable_sum -= p->ravg.prev_window;
- dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
}
@@ -3323,13 +3620,11 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
dest_rq->ed_task = p;
}
- BUG_ON((s64)src_rq->prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->curr_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
- BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
-
- trace_sched_migration_update_sum(src_rq, p);
- trace_sched_migration_update_sum(dest_rq, p);
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
done:
if (p->state == TASK_WAKING)
@@ -3368,10 +3663,6 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
update_up_down_migrate();
}
-static LIST_HEAD(related_thread_groups);
-static DEFINE_RWLOCK(related_thread_group_lock);
-static int nr_related_thread_groups;
-
/* Return cluster which can offer required capacity for group */
static struct sched_cluster *
best_cluster(struct related_thread_group *grp, u64 total_demand)
@@ -3421,6 +3712,199 @@ static void set_preferred_cluster(struct related_thread_group *grp)
raw_spin_unlock(&grp->lock);
}
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+static struct cpu_cycle
+update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime);
+
+/* Release the per-cpu group_cpu_time array allocated by alloc_group_cputime(). */
+static inline void free_group_cputime(struct related_thread_group *grp)
+{
+	free_percpu(grp->cpu_time);
+}
+
+/*
+ * Allocate and zero the per-cpu group_cpu_time counters for @grp, seeding
+ * every CPU's window_start from the current CPU's rq so the group's windows
+ * start aligned with the scheduler's window rollover.
+ *
+ * Returns 0 on success, -ENOMEM if the percpu allocation fails (caller is
+ * responsible for freeing @grp itself in that case).
+ *
+ * NOTE(review): rq->window_start is read here without the rq lock held —
+ * presumably a slightly stale value is tolerable because sync_window_start()
+ * re-synchronizes on first use; confirm against callers' locking.
+ */
+static int alloc_group_cputime(struct related_thread_group *grp)
+{
+	int i;
+	struct group_cpu_time *cpu_time;
+	int cpu = raw_smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+	u64 window_start = rq->window_start;
+
+	grp->cpu_time = alloc_percpu(struct group_cpu_time);
+	if (!grp->cpu_time)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		cpu_time = per_cpu_ptr(grp->cpu_time, i);
+		memset(cpu_time, 0, sizeof(struct group_cpu_time));
+		cpu_time->window_start = window_start;
+	}
+
+	return 0;
+}
+
+/*
+ * A group's window_start may be behind the rq's. When moving it forward by
+ * exactly one window, the group's curr counters become its prev counters
+ * (the "current" window just completed). When moving forward more than one
+ * window, the old curr counters belong to a window older than "previous",
+ * so prev is set to 0 as well. curr always restarts at 0 for the new window.
+ */
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
+{
+	u64 delta;
+	int nr_windows;
+	u64 curr_sum = cpu_time->curr_runnable_sum;
+	u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
+
+	delta = rq->window_start - cpu_time->window_start;
+	/* Already in sync — nothing to roll over. */
+	if (!delta)
+		return;
+
+	nr_windows = div64_u64(delta, sched_ravg_window);
+	if (nr_windows > 1)
+		curr_sum = nt_curr_sum = 0;
+
+	cpu_time->prev_runnable_sum = curr_sum;
+	cpu_time->curr_runnable_sum = 0;
+
+	cpu_time->nt_prev_runnable_sum = nt_curr_sum;
+	cpu_time->nt_curr_runnable_sum = 0;
+
+	cpu_time->window_start = rq->window_start;
+}
+
+/*
+ * Task's cpu usage is accounted in:
+ * rq->curr/prev_runnable_sum, when its ->grp is NULL
+ * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer the task's accumulated window contributions between those two
+ * sets of counters when it joins (@event == ADD_TASK) or leaves
+ * (@event == REM_TASK) a related_thread_group.
+ *
+ * NOTE(review): migrate_type, d and all eight src/dst pointers are only
+ * assigned in the ADD_TASK and REM_TASK branches; if a new event value were
+ * ever passed, the unconditional dereferences below would read uninitialized
+ * pointers (UB). Callers currently pass only these two values — confirm, or
+ * consider making the second branch an unconditional else with a BUG_ON.
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+				struct task_struct *p, int event)
+{
+	u64 wallclock;
+	struct group_cpu_time *cpu_time;
+	u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+	struct migration_sum_data d;
+	int migrate_type;
+
+	/* Group-level aggregation disabled: rq counters remain authoritative. */
+	if (!sched_freq_aggregate)
+		return;
+
+	wallclock = sched_ktime_clock();
+
+	/* Bring both the running task's and @p's window accounting current
+	 * before moving @p's contributions between counter sets. */
+	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+	update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+
+	/* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */
+	cpu_time = _group_cpu_time(grp, cpu_of(rq));
+	if (event == ADD_TASK) {
+		sync_window_start(rq, cpu_time);
+		migrate_type = RQ_TO_GROUP;
+		d.src_rq = rq;
+		d.src_cpu_time = NULL;
+		d.dst_rq = NULL;
+		d.dst_cpu_time = cpu_time;
+		src_curr_runnable_sum = &rq->curr_runnable_sum;
+		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		src_prev_runnable_sum = &rq->prev_runnable_sum;
+		dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+		src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+	} else if (event == REM_TASK) {
+		migrate_type = GROUP_TO_RQ;
+		d.src_rq = NULL;
+		d.src_cpu_time = cpu_time;
+		d.dst_rq = rq;
+		d.dst_cpu_time = NULL;
+
+		/*
+		 * In case of REM_TASK, cpu_time->window_start would be
+		 * up to date, because of the update_task_ravg() we called
+		 * above on the moving task. Hence no need for
+		 * sync_window_start()
+		 */
+		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		dst_curr_runnable_sum = &rq->curr_runnable_sum;
+		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+		dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+		src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+		dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+	}
+
+	/* Move @p's current- and previous-window contributions wholesale. */
+	*src_curr_runnable_sum -= p->ravg.curr_window;
+	*dst_curr_runnable_sum += p->ravg.curr_window;
+
+	*src_prev_runnable_sum -= p->ravg.prev_window;
+	*dst_prev_runnable_sum += p->ravg.prev_window;
+
+	/* New tasks are additionally tracked in the nt_* counters. */
+	if (is_new_task(p)) {
+		*src_nt_curr_runnable_sum -= p->ravg.curr_window;
+		*dst_nt_curr_runnable_sum += p->ravg.curr_window;
+		*src_nt_prev_runnable_sum -= p->ravg.prev_window;
+		*dst_nt_prev_runnable_sum += p->ravg.prev_window;
+	}
+
+	trace_sched_migration_update_sum(p, migrate_type, &d);
+
+	/* The source must never have accounted less than @p contributed. */
+	BUG_ON((s64)*src_curr_runnable_sum < 0);
+	BUG_ON((s64)*src_prev_runnable_sum < 0);
+}
+
+/*
+ * Return @p's group's per-cpu counters for @cpu, or NULL if @p has no group.
+ * Caller must be in an RCU read-side critical section (p->grp is
+ * rcu_dereference'd here).
+ *
+ * NOTE(review): calls _group_cpu_time() which is defined just below —
+ * presumably a forward declaration exists earlier in the file; verify.
+ */
+static inline struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+	return _group_cpu_time(rcu_dereference(p->grp), cpu);
+}
+
+/* NULL-tolerant accessor for a group's per-cpu group_cpu_time slot. */
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+	return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
+}
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+/*
+ * !CONFIG_SCHED_FREQ_INPUT stubs: group cputime aggregation compiles away.
+ * alloc always succeeds, transfers are no-ops and lookups yield NULL.
+ */
+
+static inline void free_group_cputime(struct related_thread_group *grp) { }
+
+static inline int alloc_group_cputime(struct related_thread_group *grp)
+{
+	return 0;
+}
+
+static inline void transfer_busy_time(struct rq *rq,
+		struct related_thread_group *grp, struct task_struct *p, int event)
+{
+}
+
+/*
+ * NOTE(review): unlike its siblings this stub is not marked inline — if it
+ * ends up unreferenced in some configs it may trip -Wunused-function; confirm.
+ */
+static struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+	return NULL;
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+	return NULL;
+}
+
+#endif
+
struct related_thread_group *alloc_related_thread_group(int group_id)
{
struct related_thread_group *grp;
@@ -3429,6 +3913,11 @@ struct related_thread_group *alloc_related_thread_group(int group_id)
if (!grp)
return ERR_PTR(-ENOMEM);
+ if (alloc_group_cputime(grp)) {
+ kfree(grp);
+ return ERR_PTR(-ENOMEM);
+ }
+
grp->id = group_id;
INIT_LIST_HEAD(&grp->tasks);
INIT_LIST_HEAD(&grp->list);
@@ -3449,6 +3938,16 @@ struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
return NULL;
}
+/*
+ * RCU callback that frees a related_thread_group after the grace period:
+ * first the per-cpu cputime counters, then the group itself. Replaces the
+ * earlier kfree_rcu() since the percpu array needs an explicit free too.
+ * See comments before preferred_cluster() for why RCU deferral is required.
+ */
+static void free_related_thread_group(struct rcu_head *rcu)
+{
+	struct related_thread_group *grp = container_of(rcu, struct
+			related_thread_group, rcu);
+
+	free_group_cputime(grp);
+	kfree(grp);
+}
+
static void remove_task_from_group(struct task_struct *p)
{
struct related_thread_group *grp = p->grp;
@@ -3458,6 +3957,7 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_lock(&grp->lock);
rq = __task_rq_lock(p);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
list_del_init(&p->grp_list);
rcu_assign_pointer(p->grp, NULL);
__task_rq_unlock(rq);
@@ -3471,9 +3971,7 @@ static void remove_task_from_group(struct task_struct *p)
if (empty_group) {
list_del(&grp->list);
- nr_related_thread_groups--;
- /* See comments before preferred_cluster() */
- kfree_rcu(grp, rcu);
+ call_rcu(&grp->rcu, free_related_thread_group);
}
}
@@ -3489,8 +3987,9 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
* reference of p->grp in various hot-paths
*/
rq = __task_rq_lock(p);
- rcu_assign_pointer(p->grp, grp);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
__task_rq_unlock(rq);
_set_preferred_cluster(grp);
@@ -3539,7 +4038,6 @@ redo:
} else if (!grp && new) {
/* New group - use object allocated before */
destroy = 0;
- nr_related_thread_groups++;
list_add(&new->list, &related_thread_groups);
grp = new;
}
@@ -3550,8 +4048,10 @@ redo:
done:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- if (destroy)
+ if (new && destroy) {
+ free_group_cputime(new);
kfree(new);
+ }
return rc;
}
@@ -3898,13 +4398,19 @@ static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
struct task_struct *p)
{
struct migration_notify_data mnd;
+ bool check_groups;
+
+ rcu_read_lock();
+ check_groups = rcu_access_pointer(p->grp) != NULL;
+ rcu_read_unlock();
if (!same_freq_domain(src_cpu, dest_cpu)) {
if (!src_cpu_dead)
- check_for_freq_change(cpu_rq(src_cpu), false);
- check_for_freq_change(cpu_rq(dest_cpu), false);
+ check_for_freq_change(cpu_rq(src_cpu), false,
+ check_groups);
+ check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
} else {
- check_for_freq_change(cpu_rq(dest_cpu), true);
+ check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
}
if (task_notify_on_migrate(p)) {
@@ -4771,6 +5277,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
struct related_thread_group *grp = NULL;
#endif
bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER);
+ bool check_group = false;
wake_flags &= ~WF_NO_NOTIFIER;
@@ -4846,6 +5353,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (update_preferred_cluster(grp, p, old_load))
set_preferred_cluster(grp);
rcu_read_unlock();
+ check_group = grp != NULL;
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -4894,12 +5402,14 @@ out:
if (freq_notif_allowed) {
if (!same_freq_domain(src_cpu, cpu)) {
- check_for_freq_change(cpu_rq(cpu), false);
- check_for_freq_change(cpu_rq(src_cpu), false);
+ check_for_freq_change(cpu_rq(cpu),
+ false, check_group);
+ check_for_freq_change(cpu_rq(src_cpu),
+ false, check_group);
} else if (heavy_task) {
- check_for_freq_change(cpu_rq(cpu), false);
+ check_for_freq_change(cpu_rq(cpu), false, false);
} else if (success) {
- check_for_freq_change(cpu_rq(cpu), true);
+ check_for_freq_change(cpu_rq(cpu), true, false);
}
}
@@ -10543,6 +11053,7 @@ void __init sched_init(void)
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
rq->old_busy_time = 0;
rq->old_estimated_time = 0;
+ rq->old_busy_time_group = 0;
rq->notifier_sent = 0;
rq->hmp_stats.pred_demands_sum = 0;
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0288a331e311..a33eddb7b17d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,9 +32,8 @@
#include <linux/task_work.h>
#include <linux/ratelimit.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -4059,6 +4058,9 @@ static inline int invalid_value_freq_input(unsigned int *data)
if (data == &sysctl_sched_freq_account_wait_time)
return !(*data == 0 || *data == 1);
+ if (data == &sysctl_sched_freq_aggregate)
+ return !(*data == 0 || *data == 1);
+
return 0;
}
#else
@@ -7674,6 +7676,7 @@ enum fbq_type { regular, remote, all };
LBF_BIG_TASK_ACTIVE_BALANCE)
#define LBF_IGNORE_BIG_TASKS 0x100
#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
@@ -7916,6 +7919,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
deactivate_task(env->src_rq, p, 0);
double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (rcu_access_pointer(p->grp))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
double_unlock_balance(env->src_rq, env->dst_rq);
}
@@ -9575,10 +9580,13 @@ no_move:
/* Assumes one 'busiest' cpu that we pulled tasks from */
if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
- check_for_freq_change(this_rq, false);
- check_for_freq_change(busiest, false);
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
} else {
- check_for_freq_change(this_rq, true);
+ check_for_freq_change(this_rq, true, false);
}
}
if (likely(!active_balance)) {
@@ -9876,10 +9884,12 @@ out_unlock:
local_irq_enable();
if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
- check_for_freq_change(busiest_rq, false);
- check_for_freq_change(target_rq, false);
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
} else if (moved) {
- check_for_freq_change(target_rq, true);
+ check_for_freq_change(target_rq, true, false);
}
if (per_cpu(dbs_boost_needed, target_cpu)) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a66d8a12051c..df9b972195e5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,16 @@ struct related_thread_group {
struct sched_cluster *preferred_cluster;
struct rcu_head rcu;
u64 last_update;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time __percpu *cpu_time; /* one per cluster */
+#endif
+};
+
+struct migration_sum_data {
+ struct rq *src_rq, *dst_rq;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time *src_cpu_time, *dst_cpu_time;
+#endif
};
extern struct list_head cluster_head;
@@ -741,7 +751,7 @@ struct rq {
struct task_struct *ed_task;
#ifdef CONFIG_SCHED_FREQ_INPUT
- unsigned int old_busy_time;
+ u64 old_busy_time, old_busy_time_group;
int notifier_sent;
u64 old_estimated_time;
#endif
@@ -1337,7 +1347,16 @@ static inline int update_preferred_cluster(struct related_thread_group *grp,
#ifdef CONFIG_SCHED_FREQ_INPUT
#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
-extern void check_for_freq_change(struct rq *rq, bool check_cra);
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+
+struct group_cpu_time {
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 window_start;
+};
/* Is frequency of two cpus synchronized with each other? */
static inline int same_freq_domain(int src_cpu, int dst_cpu)
@@ -1355,7 +1374,8 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu)
#define sched_migration_fixup 0
#define PRED_DEMAND_DELTA (0)
-static inline void check_for_freq_change(struct rq *rq, bool check_cra) { }
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
static inline int same_freq_domain(int src_cpu, int dst_cpu)
{
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
index cdb1d7c53849..c70e0466c36c 100644
--- a/kernel/sched/sched_avg.c
+++ b/kernel/sched/sched_avg.c
@@ -18,9 +18,9 @@
#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/math64.h>
-#include <trace/events/sched.h>
#include "sched.h"
+#include <trace/events/sched.h>
static DEFINE_PER_CPU(u64, nr_prod_sum);
static DEFINE_PER_CPU(u64, last_time);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1da3b96368b1..825be75ca1a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -472,6 +472,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "sched_freq_aggregate",
+ .data = &sysctl_sched_freq_aggregate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
#endif
{
.procname = "sched_boost",
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9319be..731f6484b811 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -256,7 +256,8 @@ int perf_trace_add(struct perf_event *p_event, int flags)
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
+ if (!hlist_unhashed(&p_event->hlist_entry))
+ hlist_del_rcu(&p_event->hlist_entry);
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}