diff options
| author | Linux Build Service Account <lnxbuild@localhost> | 2016-05-31 12:11:36 -0600 |
|---|---|---|
| committer | Linux Build Service Account <lnxbuild@localhost> | 2016-05-31 12:11:36 -0600 |
| commit | 3ebf81bef3da6dd1f12f70773866e0063ac3f368 (patch) | |
| tree | 20f20f9902f9df8344fa007a78f64d8e5668dcea /kernel | |
| parent | 89c198ac7fb60408d3f786ef2bcfaab2a56d4cbe (diff) | |
| parent | cd9403e96df5ef8eaefabdb67da22f0a9887964c (diff) | |
Promotion of kernel.lnx.4.4-160526.
CRs Change ID Subject
--------------------------------------------------------------------------------------------------------------
972519 I35151c460b4350ebd414b67c655684c2019f799f trace: prevent NULL pointer dereference
1013947 I7df9aeb55a95185077c679a217ed8772eb83c8b9 arm64: defconfig: update config options for msm-perf_def
1021612 I312444176373f73f02aa0ceddf5e114a39702641 ARM: msm: dts: fix register dump offsets/ranges for msmc
1019272 I38c637936b398f2fb1665c8233ed5e49e83bf296 thermal: qpnp-temp-alarm: update thermal callback parame
1020529 Ia4f54bfee8111f9f039f772a8bcc7c9a0400d5aa edac: cortex: Update the error strings to reflect Kryo2x
1005061 I4ed9f1c6ad089f80dcd19762fda151ce1572f471 msm: ipa3: WA for incorrect state retention for GSI chan
1019256 I87cca1215134e6d406f60d54f6d0430978eeae9c icnss: Add API to return the CE IRQ number
1015545 I5aad7032f3f8048216a41765f1cf91fde98f6ade msm: ipa3: fix odu debugfs
1006937 Ib8cb979136def6696861a7835bcde763dabe874f net: Warn for cloned packets in ingress path on SMP syst
1008023 Id9949bef91835318a7b344753983eea0aeab7bdc msm: ipa3: add support for TX of sk_buff's with paged da
1021612 I4fbc9aa1f30d36d35a9ad181185761e697cbbef7 msm: mdss: Fix qseed3 clk calculation overflow
1019188 Ib52e6551ac67215dab2bc5770ddcf037568f8b77 net: rmnet_data: Fix use after free when sending MAP com
989851 Ifa42fbd475665a0ca581c907ce5432584ea0e7ed msm: mdss: fix possible out-of-bounds and overflow issue
1016956 I906005680b4cc90cc38dc3d403beebf7aa515ad7 usb: dwc3: Add support handle type-c plug orientation
1019798 I7bccd68866457bb0635ae5166ec935f9e82ba760 soc: qcom: print raw data for gladiator error registers
1017182 I32f312f11fcbebbff0799120448d6e8f0d9ec98d ARM: dts: msm: Add v4l2 rotator node to msm8996
1020265 Id19733e6e075a427c4aa745b5bedc93f29a2dd4f ARM: dts: msm: Add nfc device to msmcobalt
988990 I19aa5983316bec4a87811c8aa8b54f770001c45f msm: mdss: Adding support for destination scaler
993024 I32b0e57c8e958b7e5f1d647e37e46fda052b3d1e ARM: dts: msm: Support partial goods for msmcobalt
1013948 If024f55095a951329976b6c2736ad5760eae1f4f arm64: defconfig: update config options for msmcortex-pe
1020515 I2c1fb7dcc698142f9ce42f40164521b8a78268e1 defconfig: msm: Remove incorrect ISPIF version
1013147 Iae6804bcb3121e0852ec5d14d0939623b97a6e67 qcom-charger: Don't automatically set USB_PD type when P
988990 I9a4b9701e078fa39783f33f023eef2da75c1c162 ARM: dts: msm: add extra destination scaler properties f
1020505 I87d18778fef81671c5e7cc261cc70ce07c662933 regulator: cpr3-regulator: support corner bands with no
1019888 I1a8241c1e0a349394351be2ef98381e24f0c4ff4 defconfig: msmcortex: enable qrng drivers
1003367 I75089e210a6fc72683dcf98cdd4da9d6ab3e6fcf msm: kgsl: Correction of VBIF debug bus data in snapshot
1005061 Id849055526bf70e0cc8161239b4530a7fc575744 ARM: dts: msm: enable WA for IPA channel 20 for msmcobal
1002974 Ic0dedbadc0dd2125bd2a7bcc152972c0555e07f8 msm: kgsl: Defer adding the mem entry to a process
1017182 I6fc5f90512d8024439d56d7c72ae2160df460f7a defconfig: msm: sde: Add config for v4l2 sde rotator dri
1006067 I6add3800c40cd09f6e6e0cf2720e69059bd83cbc msm: kgsl: Avoid race condition in ioctl_syncsource_dest
1013147 I77c5875ee8514395a82fac0109b7cff1d507250b usb: pd: Update power_supply type to USB_PD after PD is
1021612 I62a3bd31997be05181de98307089e2a69d98ab7b msm: mdss: fix amortized prefill calculations
1019888 I2c808713aaac42345b97665a8990f5bbb9b9145e ARM: dts: Add qrng driver support for msmcobalt
1013913 I9a17c83d6613ff37cede4a7bb52612465e4d0101 regulator: labibb: Fix slew rate calculation in LAB/IBB
1016956 Idd236136c9f0a9163b4ae7a8405c412f1d69ca9e usb: pd: Add support to notify plug orientation via extc
1016956 I893c0b729015cd22791d168453309168246961e2 usb: phy: qmp: Configure phy lane based on plug orientat
972998 I6a99fa6961e9205d7d9ccb470873c26adde8a91f ARM: dts: msm: Change csi clock voting from ispif node
1020505 I6b9d663b44c96dafba26ad25bcfc4b61c8c86d56 regulator: cpr3-regulator: support step quot for CPRh co
977896 I71e6047620066323721c6d542034ddd4b2950e7f sched: Aggregate for frequency
992942 Iaf90ab4c1d17f903d03458d76cab1b4c0a5c8836 msm: camera: isp: Fix warning and errors based on static
1013787 Ieb0a7aa1b1b5f23220854092dcc2119d29c57146 msm: camera: sensor: Add support for 3B read
1017182 If634894768b02d124ceab071a9eca1c36f258600 msm: mdss: Export rotator interrupt and share rotator sm
Change-Id: I15d2c47b635d84cffdac17adffff8274b6f8e3f4
CRs-Fixed: 1005061, 993024, 989851, 1017182, 1020505, 1021612, 1020529, 1019256, 1003367, 1006937, 1019798, 1016956, 1019272, 1013913, 972519, 1019888, 1013787, 1006067, 1015545, 1019188, 1020515, 1013147, 977896, 1008023, 1002974, 988990, 1013947, 992942, 972998, 1013948, 1020265
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/core.c | 735 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 26 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 26 | ||||
| -rw-r--r-- | kernel/sched/sched_avg.c | 2 | ||||
| -rw-r--r-- | kernel/sysctl.c | 7 | ||||
| -rw-r--r-- | kernel/trace/trace_event_perf.c | 3 |
6 files changed, 674 insertions, 125 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b55bbbd7431..87e93b3f3b4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,6 +97,9 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"}; +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", + "RQ_TO_RQ", "GROUP_TO_GROUP"}; + ATOMIC_NOTIFIER_HEAD(migration_notifier_head); ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); @@ -1864,6 +1867,61 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000; static unsigned int sync_cpu; +static LIST_HEAD(related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &related_thread_groups, list) + +/* + * Demand aggregation for frequency purpose: + * + * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads + * for frequency determination purpose. This aggregation is done per-cluster. + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just rq->prev_runnable_sum. + * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. + * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +static __read_mostly unsigned int sched_freq_aggregate; +__read_mostly unsigned int sysctl_sched_freq_aggregate; + #define EXITING_TASK_MARKER 0xdeaddead static inline int exiting_task(struct task_struct *p) @@ -1955,12 +2013,67 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load) return freq; } -/* Should scheduler alert governor for changing frequency? */ -static int send_notification(struct rq *rq, int check_pred) +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu); + +/* + * Return load from all related group in given cpu. + * Caller must ensure that related_thread_group_lock is held. + */ +static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + + for_each_related_thread_group(grp) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, cpu); + *grp_load += cpu_time->prev_runnable_sum; + if (new_grp_load) + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } +} + +/* + * Return load from all related groups in given frequency domain. + * Caller must ensure that related_thread_group_lock is held. + */ +static void group_load_in_freq_domain(struct cpumask *cpus, + u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + int j; + + for_each_related_thread_group(grp) { + for_each_cpu(j, cpus) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, j); + *grp_load += cpu_time->prev_runnable_sum; + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } + } +} + +/* + * Should scheduler alert governor for changing frequency? + * + * @check_pred - evaluate frequency based on the predictive demand + * @check_groups - add load from all related groups on given cpu + * + * check_groups is set to 1 if a "related" task movement/wakeup is triggering + * the notification check. To avoid "re-aggregation" of demand in such cases, + * we check whether the migrated/woken tasks demand (along with demand from + * existing tasks on the cpu) can be met on target cpu + * + */ + +static int send_notification(struct rq *rq, int check_pred, int check_groups) { unsigned int cur_freq, freq_required; unsigned long flags; int rc = 0; + u64 group_load = 0, new_load; if (!sched_enable_hmp) return 0; @@ -1982,8 +2095,22 @@ static int send_notification(struct rq *rq, int check_pred) if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) return 0; } else { + read_lock(&related_thread_group_lock); + /* + * Protect from concurrent update of rq->prev_runnable_sum and + * group cpu load + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (check_groups) + _group_load_in_cpu(cpu_of(rq), &group_load, NULL); + + new_load = rq->prev_runnable_sum + group_load; + + raw_spin_unlock_irqrestore(&rq->lock, flags); + read_unlock(&related_thread_group_lock); + cur_freq = load_to_freq(rq, rq->old_busy_time); - freq_required = load_to_freq(rq, rq->prev_runnable_sum); + freq_required = load_to_freq(rq, new_load); if (nearly_same_freq(cur_freq, freq_required)) return 0; @@ -1993,6 +2120,8 @@ static int send_notification(struct rq *rq, int check_pred) if (!rq->notifier_sent) { rq->notifier_sent = 1; rc = 1; + trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, + new_load); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -2000,17 +2129,13 @@ static int send_notification(struct rq *rq, int check_pred) } /* Alert governor if there is a need to change frequency */ -void check_for_freq_change(struct rq *rq, bool check_pred) +void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { int cpu = cpu_of(rq); - if (!send_notification(rq, check_pred)) + if (!send_notification(rq, check_pred, check_groups)) return; - trace_sched_freq_alert(cpu, check_pred, rq->old_busy_time, - rq->prev_runnable_sum, rq->old_estimated_time, - rq->hmp_stats.pred_demands_sum); - atomic_notifier_call_chain( &load_alert_notifier_head, 0, (void *)(long)cpu); @@ -2031,11 +2156,21 @@ static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, if (event == TASK_WAKE) return 0; - if (event == PUT_PREV_TASK || event == IRQ_UPDATE || - event == TASK_UPDATE) + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) return 1; - /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? sched_freq_account_wait_time : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ return sched_freq_account_wait_time; } @@ -2262,6 +2397,15 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) event != PICK_NEXT_TASK))) return; + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !sched_freq_account_wait_time) + return; + } + new = calc_pred_demand(rq, p); old = p->ravg.pred_demand; @@ -2290,7 +2434,14 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, u64 window_start = rq->window_start; u32 window_size = sched_ravg_window; u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + int flip_counters = 0; + int prev_sum_reset = 0; bool new_task; + struct related_thread_group *grp; new_window = mark_start < window_start; if (new_window) { @@ -2302,6 +2453,51 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, new_task = is_new_task(p); + grp = p->grp; + if (grp && sched_freq_aggregate) { + /* cpu_time protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu_of(rq)); + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (cpu_time->window_start != rq->window_start) { + int nr_windows; + + delta = rq->window_start - cpu_time->window_start; + nr_windows = div64_u64(delta, window_size); + if (nr_windows > 1) + prev_sum_reset = 1; + + cpu_time->window_start = rq->window_start; + flip_counters = 1; + } + + if (p_is_curr_task && new_window) { + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (nr_full_windows) + curr_sum = nt_curr_sum = 0; + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + } + } else { + if (p_is_curr_task && new_window) { + flip_counters = 1; + if (nr_full_windows) + prev_sum_reset = 1; + } + } + /* Handle per-task window rollover. We don't care about the idle * task or exiting tasks. */ if (new_window && !is_idle_task(p) && !exiting_task(p)) { @@ -2314,6 +2510,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, p->ravg.curr_window = 0; } + if (flip_counters) { + u64 curr_sum = *curr_runnable_sum; + u64 nt_curr_sum = *nt_curr_runnable_sum; + + if (prev_sum_reset) + curr_sum = nt_curr_sum = 0; + + *prev_runnable_sum = curr_sum; + *nt_prev_runnable_sum = nt_curr_sum; + + *curr_runnable_sum = 0; + *nt_curr_runnable_sum = 0; + } + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { /* account_busy_for_cpu_time() = 0, so no update to the * task's current window needs to be made. This could be @@ -2331,19 +2541,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, /* A new window has started. The RQ demand must be rolled * over if p is the current task. */ if (p_is_curr_task) { - u64 prev_sum = 0, nt_prev_sum = 0; - - /* p is either idle task or an exiting task */ - if (!nr_full_windows) { - prev_sum = rq->curr_runnable_sum; - nt_prev_sum = rq->nt_curr_runnable_sum; - } - - rq->prev_runnable_sum = prev_sum; - rq->curr_runnable_sum = 0; - rq->nt_prev_runnable_sum = nt_prev_sum; - rq->nt_curr_runnable_sum = 0; - + /* p is idle task */ + BUG_ON(p != rq->idle); } else if (heavy_task_wakeup(p, rq, event)) { /* A new window has started. If p is a waking * heavy task its prev_window contribution is faked @@ -2353,9 +2552,9 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, * can be controlled via the sched_heavy_task * tunable. */ p->ravg.prev_window = p->ravg.demand; - rq->prev_runnable_sum += p->ravg.demand; + *prev_runnable_sum += p->ravg.demand; if (new_task) - rq->nt_prev_runnable_sum += p->ravg.demand; + *nt_prev_runnable_sum += p->ravg.demand; } return; @@ -2373,9 +2572,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, else delta = irqtime; delta = scale_exec_time(delta, rq, cc); - rq->curr_runnable_sum += delta; + *curr_runnable_sum += delta; if (new_task) - rq->nt_curr_runnable_sum += delta; + *nt_curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) p->ravg.curr_window += delta; @@ -2409,15 +2609,17 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, if (!exiting_task(p)) p->ravg.prev_window = delta; } - rq->prev_runnable_sum += delta; + + *prev_runnable_sum += delta; if (new_task) - rq->nt_prev_runnable_sum += delta; + *nt_prev_runnable_sum += delta; /* Account piece of busy time in the current window. */ delta = scale_exec_time(wallclock - window_start, rq, cc); - rq->curr_runnable_sum += delta; + *curr_runnable_sum += delta; if (new_task) - rq->nt_curr_runnable_sum += delta; + *nt_curr_runnable_sum += delta; + if (!exiting_task(p)) p->ravg.curr_window = delta; @@ -2444,12 +2646,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, cc); if (!is_idle_task(p) && !exiting_task(p)) p->ravg.prev_window += delta; - - rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum; - if (new_task) - rq->nt_prev_runnable_sum += delta; - - delta += rq->curr_runnable_sum; } else { /* Since at least one full window has elapsed, * the contribution to the previous window is the @@ -2457,27 +2653,20 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, delta = scale_exec_time(window_size, rq, cc); if (!is_idle_task(p) && !exiting_task(p)) p->ravg.prev_window = delta; - - if (new_task) - rq->nt_prev_runnable_sum = delta; - else - rq->nt_prev_runnable_sum = 0; } - /* - * Rollover for normal runnable sum is done here by overwriting - * the values in prev_runnable_sum and curr_runnable_sum. - * Rollover for new task runnable sum has completed by previous - * if-else statement. - */ - rq->prev_runnable_sum = delta; + + /* Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; /* Account piece of busy time in the current window. */ delta = scale_exec_time(wallclock - window_start, rq, cc); - rq->curr_runnable_sum = delta; + *curr_runnable_sum += delta; if (new_task) - rq->nt_curr_runnable_sum = delta; - else - rq->nt_curr_runnable_sum = 0; + *nt_curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) p->ravg.curr_window = delta; @@ -2500,12 +2689,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, /* Roll window over. If IRQ busy time was just in the current * window then that is all that need be accounted. */ - rq->prev_runnable_sum = rq->curr_runnable_sum; - rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum; - rq->nt_curr_runnable_sum = 0; if (mark_start > window_start) { - rq->curr_runnable_sum = scale_exec_time(irqtime, rq, - cc); + *curr_runnable_sum = scale_exec_time(irqtime, rq, cc); return; } @@ -2515,7 +2700,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, if (delta > window_size) delta = window_size; delta = scale_exec_time(delta, rq, cc); - rq->prev_runnable_sum += delta; + *prev_runnable_sum += delta; /* Process the remaining IRQ busy time in the current window. */ delta = wallclock - window_start; @@ -2820,7 +3005,8 @@ update_task_ravg(struct task_struct *p, struct rq *rq, int event, update_task_pred_demand(rq, p, event); done: trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, - cc.cycles, cc.time); + cc.cycles, cc.time, + _group_cpu_time(p->grp, cpu_of(rq))); p->ravg.mark_start = wallclock; @@ -3002,7 +3188,8 @@ enum reset_reason_code { ACCOUNT_WAIT_TIME_CHANGE, HIST_SIZE_CHANGE, MIGRATION_FIXUP_CHANGE, - FREQ_ACCOUNT_WAIT_TIME_CHANGE + FREQ_ACCOUNT_WAIT_TIME_CHANGE, + FREQ_AGGREGATE_CHANGE, }; const char *sched_window_reset_reasons[] = { @@ -3021,6 +3208,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) u64 start_ts = sched_ktime_clock(); int reason = WINDOW_CHANGE; unsigned int old = 0, new = 0; + struct related_thread_group *grp; disable_window_stats(); @@ -3028,11 +3216,26 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) local_irq_save(flags); + read_lock(&related_thread_group_lock); + for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); } + list_for_each_entry(grp, &related_thread_groups, list) { + int j; + + for_each_possible_cpu(j) { + struct group_cpu_time *cpu_time; + /* Protected by rq lock */ + cpu_time = _group_cpu_time(grp, j); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + if (window_start) + cpu_time->window_start = window_start; + } + } + if (window_size) { sched_ravg_window = window_size * TICK_NSEC; set_hmp_defaults(); @@ -3081,6 +3284,12 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) new = sysctl_sched_freq_account_wait_time; sched_freq_account_wait_time = sysctl_sched_freq_account_wait_time; + } else if (sched_freq_aggregate != + sysctl_sched_freq_aggregate) { + reason = FREQ_AGGREGATE_CHANGE; + old = sched_freq_aggregate; + new = sysctl_sched_freq_aggregate; + sched_freq_aggregate = sysctl_sched_freq_aggregate; } #endif @@ -3089,6 +3298,8 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) raw_spin_unlock(&rq->lock); } + read_unlock(&related_thread_group_lock); + local_irq_restore(flags); trace_sched_reset_all_window_stats(window_start, window_size, @@ -3097,13 +3308,17 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) #ifdef CONFIG_SCHED_FREQ_INPUT +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); + void sched_get_cpus_busy(struct sched_load *busy, const struct cpumask *query_cpus) { unsigned long flags; struct rq *rq; const int cpus = cpumask_weight(query_cpus); - u64 load[cpus], nload[cpus]; + u64 load[cpus], group_load[cpus]; + u64 nload[cpus], ngload[cpus]; u64 pload[cpus]; unsigned int cur_freq[cpus], max_freq[cpus]; int notifier_sent[cpus]; @@ -3111,6 +3326,9 @@ void sched_get_cpus_busy(struct sched_load *busy, int cpu, i = 0; unsigned int window_size; struct cpu_cycle cc; + u64 max_prev_sum = 0; + int max_busy_cpu = cpumask_first(query_cpus); + struct related_thread_group *grp; if (unlikely(cpus == 0)) return; @@ -3120,6 +3338,8 @@ void sched_get_cpus_busy(struct sched_load *busy, * current task may have been executing for a long time. Ensure * that the window stats are current by doing an update. */ + read_lock(&related_thread_group_lock); + local_irq_save(flags); for_each_cpu(cpu, query_cpus) raw_spin_lock(&cpu_rq(cpu)->lock); @@ -3137,6 +3357,49 @@ void sched_get_cpus_busy(struct sched_load *busy, nload[i] = rq->nt_prev_runnable_sum; pload[i] = rq->hmp_stats.pred_demands_sum; rq->old_estimated_time = pload[i]; + + if (load[i] > max_prev_sum) { + max_prev_sum = load[i]; + max_busy_cpu = cpu; + } + + notifier_sent[i] = rq->notifier_sent; + early_detection[i] = (rq->ed_task != NULL); + rq->notifier_sent = 0; + cur_freq[i] = cpu_cur_freq(cpu); + max_freq[i] = cpu_max_freq(cpu); + i++; + } + + for_each_related_thread_group(grp) { + for_each_cpu(cpu, query_cpus) { + /* Protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu); + sync_window_start(cpu_rq(cpu), cpu_time); + } + } + + i = 0; + for_each_cpu(cpu, query_cpus) { + group_load[i] = 0; + ngload[i] = 0; + + if (early_detection[i]) + goto skip_early; + + rq = cpu_rq(cpu); + if (!notifier_sent[i]) { + if (cpu == max_busy_cpu) + group_load_in_freq_domain( + &rq->freq_domain_cpumask, + &group_load[i], &ngload[i]); + } else { + _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); + } + + load[i] += group_load[i]; + nload[i] += ngload[i]; /* * Scale load in reference to cluster max_possible_freq. * @@ -3146,11 +3409,7 @@ void sched_get_cpus_busy(struct sched_load *busy, load[i] = scale_load_to_cpu(load[i], cpu); nload[i] = scale_load_to_cpu(nload[i], cpu); pload[i] = scale_load_to_cpu(pload[i], cpu); - - notifier_sent[i] = rq->notifier_sent; - early_detection[i] = (rq->ed_task != NULL); - rq->notifier_sent = 0; - max_freq[i] = cpu_max_freq(cpu); +skip_early: i++; } @@ -3158,6 +3417,8 @@ void sched_get_cpus_busy(struct sched_load *busy, raw_spin_unlock(&(cpu_rq(cpu))->lock); local_irq_restore(flags); + read_unlock(&related_thread_group_lock); + i = 0; for_each_cpu(cpu, query_cpus) { rq = cpu_rq(cpu); @@ -3205,17 +3466,6 @@ exit_early: } } -unsigned long sched_get_busy(int cpu) -{ - struct cpumask query_cpu = CPU_MASK_NONE; - struct sched_load busy; - - cpumask_set_cpu(cpu, &query_cpu); - sched_get_cpus_busy(&busy, &query_cpu); - - return busy.prev_load; -} - void sched_set_io_is_busy(int val) { sched_io_is_busy = val; @@ -3267,7 +3517,14 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu) struct rq *src_rq = task_rq(p); struct rq *dest_rq = cpu_rq(new_cpu); u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + struct migration_sum_data d; bool new_task; + struct related_thread_group *grp; if (!sched_enable_hmp || !sched_migration_fixup || (!p->on_rq && p->state != TASK_WAKING)) @@ -3298,22 +3555,62 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu) update_task_cpu_cycles(p, new_cpu); new_task = is_new_task(p); + /* Protected by rq_lock */ + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time; + + migrate_type = GROUP_TO_GROUP; + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + sync_window_start(dest_rq, cpu_time); + } else { + migrate_type = RQ_TO_RQ; + d.src_rq = src_rq; + d.src_cpu_time = NULL; + d.dst_rq = dest_rq; + d.dst_cpu_time = NULL; + src_curr_runnable_sum = &src_rq->curr_runnable_sum; + src_prev_runnable_sum = &src_rq->prev_runnable_sum; + src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; + + dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; + dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; + dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; + } if (p->ravg.curr_window) { - src_rq->curr_runnable_sum -= p->ravg.curr_window; - dest_rq->curr_runnable_sum += p->ravg.curr_window; + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; if (new_task) { - src_rq->nt_curr_runnable_sum -= p->ravg.curr_window; - dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; } } if (p->ravg.prev_window) { - src_rq->prev_runnable_sum -= p->ravg.prev_window; - dest_rq->prev_runnable_sum += p->ravg.prev_window; + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; if (new_task) { - src_rq->nt_prev_runnable_sum -= p->ravg.prev_window; - dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; } } @@ -3323,13 +3620,11 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu) dest_rq->ed_task = p; } - BUG_ON((s64)src_rq->prev_runnable_sum < 0); - BUG_ON((s64)src_rq->curr_runnable_sum < 0); - BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); - BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); - - trace_sched_migration_update_sum(src_rq, p); - trace_sched_migration_update_sum(dest_rq, p); + trace_sched_migration_update_sum(p, migrate_type, &d); + BUG_ON((s64)*src_prev_runnable_sum < 0); + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_nt_prev_runnable_sum < 0); + BUG_ON((s64)*src_nt_curr_runnable_sum < 0); done: if (p->state == TASK_WAKING) @@ -3368,10 +3663,6 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus) update_up_down_migrate(); } -static LIST_HEAD(related_thread_groups); -static DEFINE_RWLOCK(related_thread_group_lock); -static int nr_related_thread_groups; - /* Return cluster which can offer required capacity for group */ static struct sched_cluster * best_cluster(struct related_thread_group *grp, u64 total_demand) @@ -3421,6 +3712,199 @@ static void set_preferred_cluster(struct related_thread_group *grp) raw_spin_unlock(&grp->lock); } +#define ADD_TASK 0 +#define REM_TASK 1 + +#ifdef CONFIG_SCHED_FREQ_INPUT + +static struct cpu_cycle +update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime); + +static inline void free_group_cputime(struct related_thread_group *grp) +{ + free_percpu(grp->cpu_time); +} + +static int alloc_group_cputime(struct related_thread_group *grp) +{ + int i; + struct group_cpu_time *cpu_time; + int cpu = raw_smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + u64 window_start = rq->window_start; + + grp->cpu_time = alloc_percpu(struct group_cpu_time); + if (!grp->cpu_time) + return -ENOMEM; + + for_each_possible_cpu(i) { + cpu_time = per_cpu_ptr(grp->cpu_time, i); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + cpu_time->window_start = window_start; + } + + return 0; +} + +/* + * A group's window_start may be behind. When moving it forward, flip prev/curr + * counters. When moving forward > 1 window, prev counter is set to 0 + */ +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) +{ + u64 delta; + int nr_windows; + u64 curr_sum = cpu_time->curr_runnable_sum; + u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; + + delta = rq->window_start - cpu_time->window_start; + if (!delta) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + if (nr_windows > 1) + curr_sum = nt_curr_sum = 0; + + cpu_time->prev_runnable_sum = curr_sum; + cpu_time->curr_runnable_sum = 0; + + cpu_time->nt_prev_runnable_sum = nt_curr_sum; + cpu_time->nt_curr_runnable_sum = 0; + + cpu_time->window_start = rq->window_start; +} + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + struct migration_sum_data d; + int migrate_type; + + if (!sched_freq_aggregate) + return; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + + /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(rq)); + if (event == ADD_TASK) { + sync_window_start(rq, cpu_time); + migrate_type = RQ_TO_GROUP; + d.src_rq = rq; + d.src_cpu_time = NULL; + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } else if (event == REM_TASK) { + migrate_type = GROUP_TO_RQ; + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + d.dst_rq = rq; + d.dst_cpu_time = NULL; + + /* + * In case of REM_TASK, cpu_time->window_start would be + * uptodate, because of the update_task_ravg() we called + * above on the moving task. Hence no need for + * sync_window_start() + */ + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + } + + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + + if (is_new_task(p)) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_prev_runnable_sum < 0); +} + +static inline struct group_cpu_time * +task_group_cpu_time(struct task_struct *p, int cpu) +{ + return _group_cpu_time(rcu_dereference(p->grp), cpu); +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu) +{ + return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; +} + +#else /* CONFIG_SCHED_FREQ_INPUT */ + +static inline void free_group_cputime(struct related_thread_group *grp) { } + +static inline int alloc_group_cputime(struct related_thread_group *grp) +{ + return 0; +} + +static inline void transfer_busy_time(struct rq *rq, + struct related_thread_group *grp, struct task_struct *p, int event) +{ +} + +static struct group_cpu_time * +task_group_cpu_time(struct task_struct *p, int cpu) +{ + return NULL; +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu) +{ + return NULL; +} + +#endif + struct related_thread_group *alloc_related_thread_group(int group_id) { struct related_thread_group *grp; @@ -3429,6 +3913,11 @@ struct related_thread_group *alloc_related_thread_group(int group_id) if (!grp) return ERR_PTR(-ENOMEM); + if (alloc_group_cputime(grp)) { + kfree(grp); + return ERR_PTR(-ENOMEM); + } + grp->id = group_id; INIT_LIST_HEAD(&grp->tasks); INIT_LIST_HEAD(&grp->list); @@ -3449,6 +3938,16 @@ struct related_thread_group *lookup_related_thread_group(unsigned int group_id) return NULL; } +/* See comments before preferred_cluster() */ +static void free_related_thread_group(struct rcu_head *rcu) +{ + struct related_thread_group *grp = container_of(rcu, struct + related_thread_group, rcu); + + free_group_cputime(grp); + kfree(grp); +} + static void remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; @@ -3458,6 +3957,7 @@ static void remove_task_from_group(struct task_struct *p) raw_spin_lock(&grp->lock); rq = __task_rq_lock(p); + transfer_busy_time(rq, p->grp, p, REM_TASK); list_del_init(&p->grp_list); rcu_assign_pointer(p->grp, NULL); __task_rq_unlock(rq); @@ -3471,9 +3971,7 @@ static void remove_task_from_group(struct task_struct *p) if (empty_group) { list_del(&grp->list); - nr_related_thread_groups--; - /* See comments before preferred_cluster() */ - kfree_rcu(grp, rcu); + call_rcu(&grp->rcu, free_related_thread_group); } } @@ -3489,8 +3987,9 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp) * reference of p->grp in various hot-paths */ rq = __task_rq_lock(p); - rcu_assign_pointer(p->grp, grp); + transfer_busy_time(rq, grp, p, ADD_TASK); list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); __task_rq_unlock(rq); _set_preferred_cluster(grp); @@ -3539,7 +4038,6 @@ redo: } else if (!grp && new) { /* New group - use object allocated before */ destroy = 0; - nr_related_thread_groups++; list_add(&new->list, &related_thread_groups); grp = new; } @@ -3550,8 +4048,10 @@ redo: done: raw_spin_unlock_irqrestore(&p->pi_lock, flags); - if (destroy) + if (new && destroy) { + free_group_cputime(new); kfree(new); + } return rc; } @@ -3898,13 +4398,19 @@ static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, struct task_struct *p) { struct migration_notify_data mnd; + bool check_groups; + + rcu_read_lock(); + check_groups = rcu_access_pointer(p->grp) != NULL; + rcu_read_unlock(); if (!same_freq_domain(src_cpu, dest_cpu)) { if (!src_cpu_dead) - check_for_freq_change(cpu_rq(src_cpu), false); - check_for_freq_change(cpu_rq(dest_cpu), false); + check_for_freq_change(cpu_rq(src_cpu), false, + check_groups); + check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); } else { - check_for_freq_change(cpu_rq(dest_cpu), true); + check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); } if (task_notify_on_migrate(p)) { @@ -4771,6 +5277,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) struct related_thread_group *grp = NULL; #endif bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER); + bool check_group = false; wake_flags &= ~WF_NO_NOTIFIER; @@ -4846,6 +5353,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (update_preferred_cluster(grp, p, old_load)) set_preferred_cluster(grp); rcu_read_unlock(); + check_group = grp != NULL; p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -4894,12 +5402,14 @@ out: if (freq_notif_allowed) { if (!same_freq_domain(src_cpu, cpu)) { - check_for_freq_change(cpu_rq(cpu), false); - check_for_freq_change(cpu_rq(src_cpu), false); + check_for_freq_change(cpu_rq(cpu), + false, check_group); + check_for_freq_change(cpu_rq(src_cpu), + false, check_group); } else if (heavy_task) { - check_for_freq_change(cpu_rq(cpu), false); + check_for_freq_change(cpu_rq(cpu), false, false); } else if (success) { - check_for_freq_change(cpu_rq(cpu), true); + check_for_freq_change(cpu_rq(cpu), true, false); } } @@ -10543,6 +11053,7 @@ void __init sched_init(void) rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; rq->old_busy_time = 0; rq->old_estimated_time = 0; + rq->old_busy_time_group = 0; rq->notifier_sent = 0; rq->hmp_stats.pred_demands_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0288a331e311..a33eddb7b17d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,9 +32,8 @@ #include <linux/task_work.h> #include <linux/ratelimit.h> -#include <trace/events/sched.h> - #include "sched.h" +#include <trace/events/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -4059,6 +4058,9 @@ static inline int invalid_value_freq_input(unsigned int *data) if (data == &sysctl_sched_freq_account_wait_time) return !(*data == 0 || *data == 1); + if (data == &sysctl_sched_freq_aggregate) + return !(*data == 0 || *data == 1); + return 0; } #else @@ -7674,6 +7676,7 @@ enum fbq_type { regular, remote, all }; LBF_BIG_TASK_ACTIVE_BALANCE) #define LBF_IGNORE_BIG_TASKS 0x100 #define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 +#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 struct lb_env { struct sched_domain *sd; @@ -7916,6 +7919,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + if (rcu_access_pointer(p->grp)) + env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; double_unlock_balance(env->src_rq, env->dst_rq); } @@ -9575,10 +9580,13 @@ no_move: /* Assumes one 'busiest' cpu that we pulled tasks from */ if (!same_freq_domain(this_cpu, cpu_of(busiest))) { - check_for_freq_change(this_rq, false); - check_for_freq_change(busiest, false); + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + + check_for_freq_change(this_rq, false, check_groups); + check_for_freq_change(busiest, false, check_groups); } else { - check_for_freq_change(this_rq, true); + check_for_freq_change(this_rq, true, false); } } if (likely(!active_balance)) { @@ -9876,10 +9884,12 @@ out_unlock: local_irq_enable(); if (moved && !same_freq_domain(busiest_cpu, target_cpu)) { - check_for_freq_change(busiest_rq, false); - check_for_freq_change(target_rq, false); + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + check_for_freq_change(busiest_rq, false, check_groups); + check_for_freq_change(target_rq, false, check_groups); } else if (moved) { - check_for_freq_change(target_rq, true); + check_for_freq_change(target_rq, true, false); } if (per_cpu(dbs_boost_needed, target_cpu)) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a66d8a12051c..df9b972195e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,16 @@ struct related_thread_group { struct sched_cluster *preferred_cluster; struct rcu_head rcu; u64 last_update; +#ifdef CONFIG_SCHED_FREQ_INPUT + struct group_cpu_time __percpu *cpu_time; /* one per cluster */ +#endif +}; + +struct migration_sum_data { + struct rq *src_rq, *dst_rq; +#ifdef CONFIG_SCHED_FREQ_INPUT + struct group_cpu_time *src_cpu_time, *dst_cpu_time; +#endif }; extern struct list_head cluster_head; @@ -741,7 +751,7 @@ struct rq { struct task_struct *ed_task; #ifdef CONFIG_SCHED_FREQ_INPUT - unsigned int old_busy_time; + u64 old_busy_time, old_busy_time_group; int notifier_sent; u64 old_estimated_time; #endif @@ -1337,7 +1347,16 @@ static inline int update_preferred_cluster(struct related_thread_group *grp, #ifdef CONFIG_SCHED_FREQ_INPUT #define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand) -extern void check_for_freq_change(struct rq *rq, bool check_cra); +extern void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); + +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 window_start; +}; /* Is frequency of two cpus synchronized with each other? */ static inline int same_freq_domain(int src_cpu, int dst_cpu) @@ -1355,7 +1374,8 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu) #define sched_migration_fixup 0 #define PRED_DEMAND_DELTA (0) -static inline void check_for_freq_change(struct rq *rq, bool check_cra) { } +static inline void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } static inline int same_freq_domain(int src_cpu, int dst_cpu) { diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c index cdb1d7c53849..c70e0466c36c 100644 --- a/kernel/sched/sched_avg.c +++ b/kernel/sched/sched_avg.c @@ -18,9 +18,9 @@ #include <linux/hrtimer.h> #include <linux/sched.h> #include <linux/math64.h> -#include <trace/events/sched.h> #include "sched.h" +#include <trace/events/sched.h> static DEFINE_PER_CPU(u64, nr_prod_sum); static DEFINE_PER_CPU(u64, last_time); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1da3b96368b1..825be75ca1a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -472,6 +472,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, + { + .procname = "sched_freq_aggregate", + .data = &sysctl_sched_freq_aggregate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, #endif { .procname = "sched_boost", diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a9319be..731f6484b811 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -256,7 +256,8 @@ int perf_trace_add(struct perf_event *p_event, int flags) void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); + if (!hlist_unhashed(&p_event->hlist_entry)) + hlist_del_rcu(&p_event->hlist_entry); tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } |
