Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c                         61
-rw-r--r--  kernel/cpu.c                            24
-rw-r--r--  kernel/cpu_pm.c                         31
-rw-r--r--  kernel/debug/kdb/kdb_io.c               12
-rw-r--r--  kernel/events/core.c                   121
-rw-r--r--  kernel/events/hw_breakpoint.c            2
-rw-r--r--  kernel/exit.c                           19
-rw-r--r--  kernel/fork.c                           10
-rw-r--r--  kernel/irq/manage.c                      3
-rw-r--r--  kernel/locking/spinlock_debug.c         14
-rw-r--r--  kernel/module.c                         10
-rw-r--r--  kernel/panic.c                           8
-rw-r--r--  kernel/power/Kconfig                     9
-rw-r--r--  kernel/power/Makefile                    2
-rw-r--r--  kernel/power/process.c                  22
-rw-r--r--  kernel/power/qos.c                     195
-rw-r--r--  kernel/power/suspend.c                  35
-rw-r--r--  kernel/power/wakeup_reason.c           225
-rw-r--r--  kernel/printk/printk.c                  22
-rw-r--r--  kernel/resource.c                        2
-rw-r--r--  kernel/sched/Makefile                    2
-rw-r--r--  kernel/sched/clock.c                     2
-rw-r--r--  kernel/sched/core.c                   3654
-rw-r--r--  kernel/sched/cputime.c                  13
-rw-r--r--  kernel/sched/deadline.c                 48
-rw-r--r--  kernel/sched/debug.c                    50
-rw-r--r--  kernel/sched/fair.c                   2649
-rw-r--r--  kernel/sched/features.h                  3
-rw-r--r--  kernel/sched/idle.c                      1
-rw-r--r--  kernel/sched/idle_task.c                25
-rw-r--r--  kernel/sched/rt.c                      189
-rw-r--r--  kernel/sched/sched.h                   595
-rw-r--r--  kernel/sched/sched_avg.c               128
-rw-r--r--  kernel/sched/stop_task.c                42
-rw-r--r--  kernel/smp.c                            35
-rw-r--r--  kernel/smpboot.c                         3
-rw-r--r--  kernel/sys.c                           173
-rw-r--r--  kernel/sysctl.c                        239
-rw-r--r--  kernel/sysctl_binary.c                   3
-rw-r--r--  kernel/time/Makefile                     2
-rw-r--r--  kernel/time/alarmtimer.c               270
-rw-r--r--  kernel/time/sched_clock.c                8
-rw-r--r--  kernel/time/tick-sched.c                92
-rw-r--r--  kernel/time/timer.c                     64
-rw-r--r--  kernel/trace/Kconfig                    41
-rw-r--r--  kernel/trace/Makefile                    7
-rw-r--r--  kernel/trace/blktrace.c                 80
-rw-r--r--  kernel/trace/gpu-traces.c               23
-rw-r--r--  kernel/trace/ipc_logging.c             876
-rw-r--r--  kernel/trace/ipc_logging_debug.c       184
-rw-r--r--  kernel/trace/ipc_logging_private.h     165
-rw-r--r--  kernel/trace/msm_rtb.c                 329
-rw-r--r--  kernel/trace/power-traces.c              3
-rw-r--r--  kernel/trace/trace.c                   111
-rw-r--r--  kernel/trace/trace.h                     4
-rw-r--r--  kernel/trace/trace_cpu_freq_switch.c   312
-rw-r--r--  kernel/trace/trace_event_perf.c          3
-rw-r--r--  kernel/trace/trace_events.c              5
-rw-r--r--  kernel/trace/trace_functions_graph.c    43
-rw-r--r--  kernel/trace/trace_output.c            184
-rw-r--r--  kernel/trace/trace_sched_wakeup.c        3
-rw-r--r--  kernel/trace/trace_syscalls.c            4
-rw-r--r--  kernel/trace/trace_uprobe.c              2
-rw-r--r--  kernel/watchdog.c                      143
-rw-r--r--  kernel/workqueue.c                     176

65 files changed, 11549 insertions(+), 266 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dc94f8beb097..e8d71110ed2a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2006,7 +2006,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
{
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
- struct cgroup_root *root;
+ struct cgroup_root *root = NULL;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
@@ -2671,6 +2671,45 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
return ret;
}
+int subsys_cgroup_allow_attach(struct cgroup_taskset *tset)
+{
+ const struct cred *cred = current_cred(), *tcred;
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ if (capable(CAP_SYS_NICE))
+ return 0;
+
+ cgroup_taskset_for_each(task, css, tset) {
+ tcred = __task_cred(task);
+
+ if (current != task && !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ int i;
+ int ret;
+
+ for_each_css(css, i, cgrp) {
+ if (css->ss->allow_attach) {
+ ret = css->ss->allow_attach(tset);
+ if (ret)
+ return ret;
+ } else {
+ return -EACCES;
+ }
+ }
+
+ return 0;
+}
+
static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
@@ -2685,8 +2724,24 @@ static int cgroup_procs_write_permission(struct task_struct *task,
*/
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- ret = -EACCES;
+ !uid_eq(cred->euid, tcred->suid)) {
+ /*
+ * if the default permission check fails, give each
+ * cgroup a chance to extend the permission check
+ */
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct css_set *cset;
+ cset = task_css_set(task);
+ list_add(&cset->mg_node, &tset.src_csets);
+ ret = cgroup_allow_attach(dst_cgrp, &tset);
+ list_del(&tset.src_csets);
+ if (ret)
+ ret = -EACCES;
+ }
if (!ret && cgroup_on_dfl(dst_cgrp)) {
struct super_block *sb = of->file->f_path.dentry->d_sb;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85ff5e26e23b..37731292f8a1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,8 @@
#include <linux/irq.h>
#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
#include "smpboot.h"
#ifdef CONFIG_SMP
@@ -425,6 +427,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
out_release:
cpu_hotplug_done();
+ trace_sched_cpu_hotplug(cpu, err, 0);
if (!err)
cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
@@ -530,6 +533,7 @@ out_notify:
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
out:
cpu_hotplug_done();
+ trace_sched_cpu_hotplug(cpu, ret, 1);
return ret;
}
@@ -827,3 +831,23 @@ void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+
+void idle_notifier_call_chain(unsigned long val)
+{
+ atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
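
For illustration, a minimal sketch of a consumer of the idle notifier API added above. The callback and init names are hypothetical, and the event value is whatever the platform's idle code passes to idle_notifier_call_chain(); the declarations are assumed to be exported via linux/cpu.h in this tree.

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int my_idle_notify(struct notifier_block *nb, unsigned long val,
			  void *data)
{
	/* val is the event passed to idle_notifier_call_chain() */
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_notify,
};

static int __init my_idle_init(void)
{
	idle_notifier_register(&my_idle_nb);
	return 0;
}
late_initcall(my_idle_init);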
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..774bfe7a2893 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,14 +22,17 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
+bool from_suspend = false;
+
static DEFINE_RWLOCK(cpu_pm_notifier_lock);
static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
-static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls,
+ void *data)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, data,
nr_to_call, nr_calls);
return notifier_to_errno(ret);
@@ -101,13 +104,13 @@ int cpu_pm_enter(void)
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
+ ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls, NULL);
if (ret)
/*
* Inform listeners (nr_calls - 1) about failure of CPU PM
* entry who are notified earlier to prepare for it.
*/
- cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -131,7 +134,7 @@ int cpu_pm_exit(void)
int ret;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -154,19 +157,21 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*
* Return conditions are same as __raw_notifier_call_chain.
*/
-int cpu_cluster_pm_enter(void)
+int cpu_cluster_pm_enter(unsigned long aff_level)
{
int nr_calls;
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls,
+ (void *) aff_level);
if (ret)
/*
* Inform listeners (nr_calls - 1) about failure of CPU cluster
* PM entry who are notified earlier to prepare for it.
*/
- cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL,
+ (void *) aff_level);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -188,12 +193,12 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*
* Return conditions are same as __raw_notifier_call_chain.
*/
-int cpu_cluster_pm_exit(void)
+int cpu_cluster_pm_exit(unsigned long aff_level)
{
int ret;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL, (void *) aff_level);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -205,17 +210,19 @@ static int cpu_pm_suspend(void)
{
int ret;
+ from_suspend = true;
ret = cpu_pm_enter();
if (ret)
return ret;
- ret = cpu_cluster_pm_enter();
+ ret = cpu_cluster_pm_enter(0);
return ret;
}
static void cpu_pm_resume(void)
{
- cpu_cluster_pm_exit();
+ from_suspend = false;
+ cpu_cluster_pm_exit(0);
cpu_pm_exit();
}
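
For illustration, a sketch of a CPU PM notifier that consumes the affinity level now passed through the notifier data pointer; the callback name is hypothetical.

#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static int my_cpu_pm_notify(struct notifier_block *nb, unsigned long action,
			    void *data)
{
	/* For cluster events, data carries the aff_level cast to a pointer */
	unsigned long aff_level = (unsigned long)data;

	switch (action) {
	case CPU_CLUSTER_PM_ENTER:
		/* affinity level aff_level is about to power down */
		break;
	case CPU_CLUSTER_PM_EXIT:
		/* affinity level aff_level has powered back up */
		break;
	}
	return NOTIFY_OK;
}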
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index fc1ef736253c..0b891286a150 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int i;
int diag, dtab_count;
int key;
-
+ static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -237,6 +237,9 @@ poll_again:
return buffer;
if (key != 9)
tab = 0;
+ if (key != 10 && key != 13)
+ last_crlf = 0;
+
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
@@ -254,7 +257,12 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* new line */
+ case 13: /* carriage return */
+ /* handle \n after \r */
+ if (last_crlf && last_crlf != key)
+ break;
+ last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1e889a078dbc..96100cc046c5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -176,7 +176,11 @@ static struct srcu_struct pmus_srcu;
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
*/
+#ifdef CONFIG_PERF_EVENTS_USERMODE
+int sysctl_perf_event_paranoid __read_mostly = -1;
+#else
int sysctl_perf_event_paranoid __read_mostly = 1;
+#endif
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -1657,7 +1661,32 @@ static int __perf_remove_from_context(void *info)
}
-/*
+#ifdef CONFIG_SMP
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+ int up_ret;
+ /*
+ * CPU was offline. Bring it online so we can
+ * gracefully exit a perf context.
+ */
+ up_ret = cpu_up(event->cpu);
+ if (!up_ret)
+ /* Try the remove call once again. */
+ cpu_function_call(event->cpu, __perf_remove_from_context,
+ rep);
+ else
+ pr_err("Failed to bring up CPU: %d, ret: %d\n",
+ event->cpu, up_ret);
+}
+#else
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+}
+#endif
+
+ /*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
@@ -1670,7 +1699,8 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void __ref perf_remove_from_context(struct perf_event *event,
+ bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -1678,6 +1708,7 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
.event = event,
.detach_group = detach_group,
};
+ int ret;
lockdep_assert_held(&ctx->mutex);
@@ -1688,7 +1719,11 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
* already called __perf_remove_from_context from
* perf_event_exit_cpu.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, &re);
+ ret = cpu_function_call(event->cpu, __perf_remove_from_context,
+ &re);
+ if (ret == -ENXIO)
+ perf_retry_remove(event, &re);
+
return;
}
@@ -3460,7 +3495,8 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (event->owner != EVENT_OWNER_KERNEL && perf_paranoid_cpu() &&
+ !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
/*
@@ -3844,6 +3880,15 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
+ struct perf_event *event = file->private_data;
+
+ /*
+ * Event can be in state OFF because of a constraint check.
+ * Change to ACTIVE so that it gets cleaned up correctly.
+ */
+ if ((event->state == PERF_EVENT_STATE_OFF) &&
+ event->attr.constraint_duplicate)
+ event->state = PERF_EVENT_STATE_ACTIVE;
put_event(file->private_data);
return 0;
}
@@ -6920,6 +6965,8 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
#ifdef CONFIG_EVENT_TRACING
@@ -7041,6 +7088,8 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
static inline void perf_tp_register(void)
@@ -7319,6 +7368,8 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
+
+ .events_across_hotplug = 1,
};
/*
@@ -7400,6 +7451,8 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
+
+ .events_across_hotplug = 1,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -8272,6 +8325,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
return err;
+ if (attr.constraint_duplicate || attr.__reserved_1)
+ return -EINVAL;
+
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -9302,6 +9358,18 @@ static void __perf_event_exit_context(void *__info)
rcu_read_unlock();
}
+static void __perf_event_stop_swclock(void *__info)
+{
+ struct perf_event_context *ctx = __info;
+ struct perf_event *event, *tmp;
+
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ if (event->attr.config == PERF_COUNT_SW_CPU_CLOCK &&
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ cpu_clock_event_stop(event, 0);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_event_context *ctx;
@@ -9311,20 +9379,56 @@ static void perf_event_exit_cpu_context(int cpu)
idx = srcu_read_lock(&pmus_srcu);
list_for_each_entry_rcu(pmu, &pmus, entry) {
ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
-
mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ /*
+ * If keeping events across hotplugging is supported, do not
+ * remove the event list, but keep it alive across CPU hotplug.
+ * The context is exited via an fd close path when userspace
+ * is done and the target CPU is online. If software clock
+ * event is active, then stop hrtimer associated with it.
+ * Start the timer when the CPU comes back online.
+ */
+ if (!pmu->events_across_hotplug)
+ smp_call_function_single(cpu, __perf_event_exit_context,
+ ctx, 1);
+ else
+ smp_call_function_single(cpu, __perf_event_stop_swclock,
+ ctx, 1);
mutex_unlock(&ctx->mutex);
}
srcu_read_unlock(&pmus_srcu, idx);
}
+static void perf_event_start_swclock(int cpu)
+{
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+ struct perf_event *event, *tmp;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->events_across_hotplug) {
+ ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ list_for_each_entry_safe(event, tmp, &ctx->event_list,
+ event_entry) {
+ if (event->attr.config ==
+ PERF_COUNT_SW_CPU_CLOCK &&
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ cpu_clock_event_start(event, 0);
+ }
+ }
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
static void perf_event_exit_cpu(int cpu)
{
perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_start_swclock(int cpu) { }
#endif
static int
@@ -9363,6 +9467,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
+
+ case CPU_STARTING:
+ perf_event_start_swclock(cpu);
+ break;
+
default:
break;
}
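
For context, a userspace sketch of the kind of event the events_across_hotplug flag preserves: a PERF_COUNT_SW_CPU_CLOCK event bound to one CPU, whose hrtimer is now stopped at CPU_DOWN_PREPARE and restarted at CPU_STARTING instead of the event being torn down. The CPU number is arbitrary.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_cpu_clock_event(int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK,
		.size = sizeof(attr),
	};

	/* pid == -1, cpu >= 0: count on one CPU regardless of task */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}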
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4ccc26..7da5b674d16e 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -614,6 +614,8 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
+
+ .events_across_hotplug = 1,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/exit.c b/kernel/exit.c
index 07110c6020a0..a32e83d567b9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -388,6 +388,7 @@ static void exit_mm(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
+ int mm_released;
mm_release(tsk, mm);
if (!mm)
@@ -434,9 +435,12 @@ static void exit_mm(struct task_struct *tsk)
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mm_update_next_owner(mm);
- mmput(mm);
+
+ mm_released = mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
+ if (mm_released)
+ set_tsk_thread_flag(tsk, TIF_MM_RELEASED);
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -632,6 +636,7 @@ static void check_stack_usage(void)
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
unsigned long free;
+ int islower = false;
free = stack_not_used(current);
@@ -640,11 +645,16 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
- current->comm, task_pid_nr(current), free);
lowest_to_date = free;
+ islower = true;
}
spin_unlock(&low_water_lock);
+
+ if (islower) {
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
+ }
}
#else
static inline void check_stack_usage(void) {}
@@ -699,6 +709,9 @@ void do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ sched_exit(tsk);
+
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/fork.c b/kernel/fork.c
index 1155eac61687..c9eb86b646ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
+#include <linux/kasan.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
+ kasan_alloc_pages(virt_to_page(ti), THREAD_SIZE_ORDER);
free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
@@ -692,8 +694,9 @@ EXPORT_SYMBOL_GPL(__mmdrop);
/*
* Decrement the use count and release all resources for an mm.
*/
-void mmput(struct mm_struct *mm)
+int mmput(struct mm_struct *mm)
{
+ int mm_freed = 0;
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) {
@@ -711,7 +714,9 @@ void mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
+ mm_freed = 1;
}
+ return mm_freed;
}
EXPORT_SYMBOL_GPL(mmput);
@@ -800,7 +805,8 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mm = get_task_mm(task);
if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ !ptrace_may_access(task, mode) &&
+ !capable(CAP_SYS_RESOURCE)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
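
With mmput() now reporting whether the final reference was dropped, callers can key off the actual release, as exit_mm() does above; a minimal sketch with a hypothetical helper:

#include <linux/sched.h>

/* Hypothetical helper mirroring the exit_mm() usage above */
static void drop_mm_and_flag(struct task_struct *tsk, struct mm_struct *mm)
{
	if (mmput(mm))	/* returns 1 only when the mm was actually freed */
		set_tsk_thread_flag(tsk, TIF_MM_RELEASED);
}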
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6ead200370da..5cb153a8474a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -319,6 +319,9 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
desc->affinity_notify = notify;
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (!notify && old_notify)
+ cancel_work_sync(&old_notify->work);
+
if (old_notify)
kref_put(&old_notify->kref, old_notify->release);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a596cffa..d381f559e0ce 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,8 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/bug.h>
+#include <soc/qcom/watchdog.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
struct lock_class_key *key)
@@ -64,6 +66,11 @@ static void spin_dump(raw_spinlock_t *lock, const char *msg)
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
lock->owner_cpu);
+#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG
+ msm_trigger_wdog_bite();
+#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG)
+ BUG();
+#endif
dump_stack();
}
@@ -114,7 +121,7 @@ static void __spin_lock_debug(raw_spinlock_t *lock)
__delay(1);
}
/* lockup suspected: */
- spin_dump(lock, "lockup suspected");
+ spin_bug(lock, "lockup suspected");
#ifdef CONFIG_SMP
trigger_all_cpu_backtrace();
#endif
@@ -167,6 +174,11 @@ static void rwlock_bug(rwlock_t *lock, const char *msg)
printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
msg, raw_smp_processor_id(), current->comm,
task_pid_nr(current), lock);
+#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG
+ msm_trigger_wdog_bite();
+#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG)
+ BUG();
+#endif
dump_stack();
}
diff --git a/kernel/module.c b/kernel/module.c
index 0e5c71195f18..fe5248ab3378 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2499,7 +2499,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* We'll tack temporary mod_kallsyms on the end. */
mod->init_size = ALIGN(mod->init_size,
- __alignof__(struct mod_kallsyms));
+ __alignof__(struct mod_kallsyms));
info->mod_kallsyms_init_off = mod->init_size;
mod->init_size += sizeof(struct mod_kallsyms);
mod->init_size = debug_align(mod->init_size);
@@ -2578,7 +2578,13 @@ void * __weak module_alloc(unsigned long size)
return vmalloc_exec(size);
}
-#ifdef CONFIG_DEBUG_KMEMLEAK
+#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF)
+static void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ kmemleak_no_scan(mod->module_core);
+}
+#elif defined(CONFIG_DEBUG_KMEMLEAK)
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
{
diff --git a/kernel/panic.c b/kernel/panic.c
index 41e2b54f36b5..223564d3e1f8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -28,6 +28,9 @@
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
+/* Machine specific panic information string */
+char *mach_panic_string;
+
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask;
static int pause_on_oops;
@@ -412,6 +415,11 @@ late_initcall(init_oops_id);
void print_oops_end_marker(void)
{
init_oops_id();
+
+ if (mach_panic_string)
+ printk(KERN_WARNING "Board Information: %s\n",
+ mach_panic_string);
+
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
}
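
A sketch of how platform code might publish board information for the oops end marker; the string contents and init name are hypothetical.

#include <linux/init.h>

extern char *mach_panic_string;

static char board_info[] = "some-board rev 2";	/* hypothetical contents */

static int __init board_panic_init(void)
{
	mach_panic_string = board_info;
	return 0;
}
core_initcall(board_panic_init);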
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 02e8dfaa1ce2..84c480946fb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -28,6 +28,15 @@ config SUSPEND_SKIP_SYNC
of suspend, or they are content with invoking sync() from
user-space before invoking suspend. Say Y if that's your case.
+config WAKELOCK
+ bool "Android's method of preventing suspend"
+ default y
+ ---help---
+ This allows applications to prevent the CPU from suspending while
+ they need it.
+
Say Y if you are running an Android userspace.
+
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..22eb9ed879ad 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -12,3 +12,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_SUSPEND) += wakeup_reason.o
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 564f786df470..e7f1f736a5b6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -18,6 +18,7 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
/*
* Timeout for stopping processes
@@ -35,6 +36,9 @@ static int try_to_freeze_tasks(bool user_only)
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+#ifdef CONFIG_PM_SLEEP
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+#endif
do_gettimeofday(&start);
@@ -64,6 +68,11 @@ static int try_to_freeze_tasks(bool user_only)
break;
if (pm_wakeup_pending()) {
+#ifdef CONFIG_PM_SLEEP
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
+#endif
wakeup = true;
break;
}
@@ -83,15 +92,17 @@ static int try_to_freeze_tasks(bool user_only)
do_div(elapsed_msecs64, NSEC_PER_MSEC);
elapsed_msecs = elapsed_msecs64;
- if (todo) {
+ if (wakeup) {
pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
- wakeup ? "aborted" : "failed",
+ pr_err("Freezing of tasks aborted after %d.%03d seconds",
+ elapsed_msecs / 1000, elapsed_msecs % 1000);
+ } else if (todo) {
+ pr_cont("\n");
+ pr_err("Freezing of tasks failed after %d.%03d seconds"
+ " (%d tasks refusing to freeze, wq_busy=%d):\n",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
- if (!wakeup) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
if (p != current && !freezer_should_skip(p)
@@ -99,7 +110,6 @@ static int try_to_freeze_tasks(bool user_only)
sched_show_task(p);
}
read_unlock(&tasklist_lock);
- }
} else {
pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..8ecc7b3f7dd9 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -43,6 +43,8 @@
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -67,6 +69,8 @@ static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_constraints cpu_dma_constraints = {
.list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .target_per_cpu = { [0 ... (NR_CPUS - 1)] =
+ PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE },
.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
@@ -81,6 +85,8 @@ static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_constraints network_lat_constraints = {
.list = PLIST_HEAD_INIT(network_lat_constraints.list),
.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .target_per_cpu = { [0 ... (NR_CPUS - 1)] =
+ PM_QOS_NETWORK_LAT_DEFAULT_VALUE },
.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
@@ -91,11 +97,12 @@ static struct pm_qos_object network_lat_pm_qos = {
.name = "network_latency",
};
-
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_constraints network_tput_constraints = {
.list = PLIST_HEAD_INIT(network_tput_constraints.list),
.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .target_per_cpu = { [0 ... (NR_CPUS - 1)] =
+ PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE },
.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
@@ -259,22 +266,60 @@ static const struct file_operations pm_qos_debug_fops = {
.release = single_release,
};
+static inline void pm_qos_set_value_for_cpus(struct pm_qos_constraints *c,
+ struct cpumask *cpus)
+{
+ struct pm_qos_request *req = NULL;
+ int cpu;
+ s32 qos_val[NR_CPUS] = { [0 ... (NR_CPUS - 1)] = c->default_value };
+
+ plist_for_each_entry(req, &c->list, node) {
+ for_each_cpu(cpu, &req->cpus_affine) {
+ switch (c->type) {
+ case PM_QOS_MIN:
+ if (qos_val[cpu] > req->node.prio)
+ qos_val[cpu] = req->node.prio;
+ break;
+ case PM_QOS_MAX:
+ if (req->node.prio > qos_val[cpu])
+ qos_val[cpu] = req->node.prio;
+ break;
+ case PM_QOS_SUM:
+ qos_val[cpu] += req->node.prio;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (c->target_per_cpu[cpu] != qos_val[cpu])
+ cpumask_set_cpu(cpu, cpus);
+ c->target_per_cpu[cpu] = qos_val[cpu];
+ }
+}
+
/**
* pm_qos_update_target - manages the constraints list and calls the notifiers
* if needed
* @c: constraints data struct
- * @node: request to add to the list, to update or to remove
+ * @req: request to add to the list, to update or to remove
* @action: action to take on the constraints list
* @value: value of the request to add or update
*
* This function returns 1 if the aggregated constraint value has changed, 0
* otherwise.
*/
-int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
- enum pm_qos_req_action action, int value)
+int pm_qos_update_target(struct pm_qos_constraints *c,
+ struct pm_qos_request *req,
+ enum pm_qos_req_action action, int value)
{
unsigned long flags;
int prev_value, curr_value, new_value;
+ struct plist_node *node = &req->node;
+ struct cpumask cpus;
int ret;
spin_lock_irqsave(&pm_qos_lock, flags);
@@ -305,7 +350,9 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
}
curr_value = pm_qos_get_value(c);
+ cpumask_clear(&cpus);
pm_qos_set_value(c, curr_value);
+ pm_qos_set_value_for_cpus(c, &cpus);
spin_unlock_irqrestore(&pm_qos_lock, flags);
@@ -315,7 +362,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
if (c->notifiers)
blocking_notifier_call_chain(c->notifiers,
(unsigned long)curr_value,
- NULL);
+ &cpus);
} else {
ret = 0;
}
@@ -398,12 +445,50 @@ int pm_qos_request(int pm_qos_class)
}
EXPORT_SYMBOL_GPL(pm_qos_request);
+int pm_qos_request_for_cpu(int pm_qos_class, int cpu)
+{
+ return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu];
+}
+EXPORT_SYMBOL(pm_qos_request_for_cpu);
+
int pm_qos_request_active(struct pm_qos_request *req)
{
return req->pm_qos_class != 0;
}
EXPORT_SYMBOL_GPL(pm_qos_request_active);
+int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask)
+{
+ unsigned long irqflags;
+ int cpu;
+ struct pm_qos_constraints *c = NULL;
+ int val;
+
+ spin_lock_irqsave(&pm_qos_lock, irqflags);
+ c = pm_qos_array[pm_qos_class]->constraints;
+ val = c->default_value;
+
+ for_each_cpu(cpu, mask) {
+ switch (c->type) {
+ case PM_QOS_MIN:
+ if (c->target_per_cpu[cpu] < val)
+ val = c->target_per_cpu[cpu];
+ break;
+ case PM_QOS_MAX:
+ if (c->target_per_cpu[cpu] > val)
+ val = c->target_per_cpu[cpu];
+ break;
+ default:
+ BUG();
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&pm_qos_lock, irqflags);
+
+ return val;
+}
+EXPORT_SYMBOL(pm_qos_request_for_cpumask);
+
static void __pm_qos_update_request(struct pm_qos_request *req,
s32 new_value)
{
@@ -412,7 +497,7 @@ static void __pm_qos_update_request(struct pm_qos_request *req,
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ req, PM_QOS_UPDATE_REQ, new_value);
}
/**
@@ -430,6 +515,41 @@ static void pm_qos_work_fn(struct work_struct *work)
__pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
}
+#ifdef CONFIG_SMP
+static void pm_qos_irq_release(struct kref *ref)
+{
+ unsigned long flags;
+ struct irq_affinity_notify *notify = container_of(ref,
+ struct irq_affinity_notify, kref);
+ struct pm_qos_request *req = container_of(notify,
+ struct pm_qos_request, irq_notify);
+ struct pm_qos_constraints *c =
+ pm_qos_array[req->pm_qos_class]->constraints;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ cpumask_setall(&req->cpus_affine);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, c->default_value);
+}
+
+static void pm_qos_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ unsigned long flags;
+ struct pm_qos_request *req = container_of(notify,
+ struct pm_qos_request, irq_notify);
+ struct pm_qos_constraints *c =
+ pm_qos_array[req->pm_qos_class]->constraints;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ cpumask_copy(&req->cpus_affine, mask);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, req->node.prio);
+}
+#endif
+
/**
* pm_qos_add_request - inserts new qos request into the list
* @req: pointer to a preallocated handle
@@ -453,11 +573,56 @@ void pm_qos_add_request(struct pm_qos_request *req,
WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
return;
}
+
+ switch (req->type) {
+ case PM_QOS_REQ_AFFINE_CORES:
+ if (cpumask_empty(&req->cpus_affine)) {
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ WARN(1, KERN_ERR "Affine cores not set for request with affinity flag\n");
+ }
+ break;
+#ifdef CONFIG_SMP
+ case PM_QOS_REQ_AFFINE_IRQ:
+ if (irq_can_set_affinity(req->irq)) {
+ int ret = 0;
+ struct irq_desc *desc = irq_to_desc(req->irq);
+ struct cpumask *mask = desc->irq_data.common->affinity;
+
+ /* Get the current affinity */
+ cpumask_copy(&req->cpus_affine, mask);
+ req->irq_notify.irq = req->irq;
+ req->irq_notify.notify = pm_qos_irq_notify;
+ req->irq_notify.release = pm_qos_irq_release;
+
+ ret = irq_set_affinity_notifier(req->irq,
+ &req->irq_notify);
+ if (ret) {
+ WARN(1, KERN_ERR "IRQ affinity notify set failed\n");
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ }
+ } else {
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ WARN(1, KERN_ERR "IRQ-%d not set for request with affinity flag\n",
+ req->irq);
+ }
+ break;
+#endif
+ default:
+ WARN(1, KERN_ERR "Unknown request type %d\n", req->type);
+ /* fall through */
+ case PM_QOS_REQ_ALL_CORES:
+ cpumask_setall(&req->cpus_affine);
+ break;
+ }
+
req->pm_qos_class = pm_qos_class;
INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
trace_pm_qos_add_request(pm_qos_class, value);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
- &req->node, PM_QOS_ADD_REQ, value);
+ req, PM_QOS_ADD_REQ, value);
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
@@ -511,7 +676,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ req, PM_QOS_UPDATE_REQ, new_value);
schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
}
@@ -531,15 +696,25 @@ void pm_qos_remove_request(struct pm_qos_request *req)
/* silent return to keep pcm code cleaner */
if (!pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ WARN(1, "pm_qos_remove_request() called for unknown object\n");
return;
}
cancel_delayed_work_sync(&req->work);
+#ifdef CONFIG_SMP
+ if (req->type == PM_QOS_REQ_AFFINE_IRQ) {
+ int ret = 0;
+ /* Get the current affinity */
+ ret = irq_set_affinity_notifier(req->irq, NULL);
+ if (ret)
+ WARN(1, "IRQ affinity notify set failed\n");
+ }
+#endif
+
trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_REMOVE_REQ,
+ req, PM_QOS_REMOVE_REQ,
PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
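
A sketch of a driver using the affinity-aware request types added above; the IRQ number and latency value are hypothetical.

#include <linux/pm_qos.h>

static struct pm_qos_request my_qos_req = {
	.type = PM_QOS_REQ_AFFINE_IRQ,
	.irq = 123,	/* hypothetical wake-critical IRQ */
};

static void my_driver_start(void)
{
	/* Constrains DMA latency only on the CPUs the IRQ is affine to;
	 * the request then follows the IRQ via the affinity notifier. */
	pm_qos_add_request(&my_qos_req, PM_QOS_CPU_DMA_LATENCY, 10);
}

/* A per-cpu consumer (e.g. a cpuidle governor) reads the aggregate: */
static s32 cpu_latency_limit(int cpu)
{
	return pm_qos_request_for_cpu(PM_QOS_CPU_DMA_LATENCY, cpu);
}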
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f9fe133c13e2..024411816ccf 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,9 +26,11 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
+#include <linux/rtc.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
#include "power.h"
@@ -312,7 +314,8 @@ void __weak arch_suspend_enable_irqs(void)
*/
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
- int error;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+ int error, last_dev;
error = platform_suspend_prepare(state);
if (error)
@@ -320,7 +323,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
printk(KERN_ERR "PM: late suspend of devices failed\n");
+ log_suspend_abort_reason("%s device failed to power down",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -329,7 +336,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ log_suspend_abort_reason("noirq suspend of %s device failed",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -353,8 +364,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
}
error = disable_nonboot_cpus();
- if (error || suspend_test(TEST_CPUS))
+ if (error || suspend_test(TEST_CPUS)) {
+ log_suspend_abort_reason("Disabling non-boot cpus failed");
goto Enable_cpus;
+ }
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
@@ -370,6 +383,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
state, false);
events_check_enabled = false;
} else if (*wakeup) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
error = -EBUSY;
}
syscore_resume();
@@ -417,6 +433,7 @@ int suspend_devices_and_enter(suspend_state_t state)
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -518,6 +535,18 @@ static int enter_state(suspend_state_t state)
return error;
}
+static void pm_suspend_marker(char *annotation)
+{
+ struct timespec ts;
+ struct rtc_time tm;
+
+ getnstimeofday(&ts);
+ rtc_time_to_tm(ts.tv_sec, &tm);
+ pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n",
+ annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+}
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -532,6 +561,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_suspend_marker("entry");
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -539,6 +569,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pm_suspend_marker("exit");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100644
index 000000000000..252611fad2fe
--- /dev/null
+++ b/kernel/power/wakeup_reason.c
@@ -0,0 +1,225 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static bool suspend_abort;
+static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+static struct kobject *wakeup_reason;
+static DEFINE_SPINLOCK(resume_reason_lock);
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int irq_no, buf_offset = 0;
+ struct irq_desc *desc;
+ spin_lock(&resume_reason_lock);
+ if (suspend_abort) {
+ buf_offset = sprintf(buf, "Abort: %s", abort_reason);
+ } else {
+ for (irq_no = 0; irq_no < irqcount; irq_no++) {
+ desc = irq_to_desc(irq_list[irq_no]);
+ if (desc && desc->action && desc->action->name)
+ buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+ irq_list[irq_no], desc->action->name);
+ else
+ buf_offset += sprintf(buf + buf_offset, "%d\n",
+ irq_list[irq_no]);
+ }
+ }
+ spin_unlock(&resume_reason_lock);
+ return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct timespec sleep_time;
+ struct timespec total_time;
+ struct timespec suspend_resume_time;
+
+ /*
+ * total_time is calculated from monotonic boottime offsets because,
+ * unlike CLOCK_MONOTONIC, they include the time spent in suspend state.
+ */
+ total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+ /*
+ * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC)
+ * time interval before entering suspend and post suspend.
+ */
+ suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+ /* sleep_time = total_time - suspend_resume_time */
+ sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+ /* Export suspend_resume_time and sleep_time in pair here. */
+ return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+ suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+ sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+ &resume_reason.attr,
+ &suspend_time.attr,
+ NULL,
+};
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/*
+ * Logs the wakeup reasons to the kernel log and
+ * stores the IRQs to expose them to userspace via sysfs.
+ */
+void log_wakeup_reason(int irq)
+{
+ struct irq_desc *desc;
+ desc = irq_to_desc(irq);
+ if (desc && desc->action && desc->action->name)
+ printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+ desc->action->name);
+ else
+ printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+ spin_lock(&resume_reason_lock);
+ if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+ spin_unlock(&resume_reason_lock);
+ printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+ MAX_WAKEUP_REASON_IRQS);
+ return;
+ }
+
+ irq_list[irqcount++] = irq;
+ spin_unlock(&resume_reason_lock);
+}
+
+int check_wakeup_reason(int irq)
+{
+ int irq_no;
+ int ret = false;
+
+ spin_lock(&resume_reason_lock);
+ for (irq_no = 0; irq_no < irqcount; irq_no++)
+ if (irq_list[irq_no] == irq) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&resume_reason_lock);
+ return ret;
+}
+
+void log_suspend_abort_reason(const char *fmt, ...)
+{
+ va_list args;
+
+ spin_lock(&resume_reason_lock);
+
+ /* Suspend abort reason has already been logged. */
+ if (suspend_abort) {
+ spin_unlock(&resume_reason_lock);
+ return;
+ }
+
+ suspend_abort = true;
+ va_start(args, fmt);
+ vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
+ va_end(args);
+ spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wakeup reasons */
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ spin_lock(&resume_reason_lock);
+ irqcount = 0;
+ suspend_abort = false;
+ spin_unlock(&resume_reason_lock);
+ /* monotonic time since boot */
+ last_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ last_stime = ktime_get_boottime();
+ break;
+ case PM_POST_SUSPEND:
+ /* monotonic time since boot */
+ curr_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ curr_stime = ktime_get_boottime();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+ .notifier_call = wakeup_reason_pm_event,
+};
+
+/*
+ * Initializes the sysfs parameters and registers the pm_event notifier.
+ */
+int __init wakeup_reason_init(void)
+{
+ int retval;
+
+ retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+ if (retval)
+ printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+ __func__, retval);
+
+ wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+ if (!wakeup_reason) {
+ printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+ __func__);
+ return 1;
+ }
+ retval = sysfs_create_group(wakeup_reason, &attr_group);
+ if (retval) {
+ kobject_put(wakeup_reason);
+ printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+ __func__, retval);
+ }
+ return 0;
+}
+
+late_initcall(wakeup_reason_init);
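
A sketch of a driver checking the logged wakeup reasons in its resume path (the IRQ macro is hypothetical); userspace reads the same information from /sys/kernel/wakeup_reasons/last_resume_reason and last_suspend_time.

#include <linux/device.h>
#include <linux/wakeup_reason.h>

static int my_driver_resume(struct device *dev)
{
	/* Was our interrupt among the IRQs that woke the system? */
	if (check_wakeup_reason(MY_WAKE_IRQ))	/* hypothetical IRQ number */
		dev_info(dev, "resume caused by our wake IRQ\n");
	return 0;
}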
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c048e34b177f..7b884dc55bd0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,10 @@
#include "console_cmdline.h"
#include "braille.h"
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+extern void printascii(char *);
+#endif
+
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
@@ -232,7 +236,11 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
-};
+}
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+__packed __aligned(4)
+#endif
+;
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
@@ -273,11 +281,7 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define LOG_ALIGN 4
-#else
#define LOG_ALIGN __alignof__(struct printk_log)
-#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
@@ -1754,6 +1758,10 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+ printascii(text);
+#endif
+
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
@@ -2130,8 +2138,12 @@ static int console_cpu_notify(struct notifier_block *self,
case CPU_DEAD:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
+ case CPU_DYING:
+#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG
console_lock();
console_unlock();
+#endif
+ break;
}
return NOTIFY_OK;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 249b1eb1e6e1..4c9835c09dcd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -163,7 +163,7 @@ static const struct file_operations proc_iomem_operations = {
static int __init ioresources_init(void)
{
proc_create("ioports", 0, NULL, &proc_ioports_operations);
- proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations);
return 0;
}
__initcall(ioresources_init);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..846c15156616 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o completion.o idle.o sched_avg.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041f5b0a..bc54e84675da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 70e5e09341f1..58303b3dc356 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,9 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/cpufreq.h>
+#include <linux/syscore_ops.h>
+#include <linux/list_sort.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -82,6 +85,9 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
+#ifdef CONFIG_MSM_APP_SETTINGS
+#include <asm/app_api.h>
+#endif
#include "sched.h"
#include "../workqueue_internal.h"
@@ -90,6 +96,16 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
+ "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
+ "IRQ_UPDATE"};
+
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
+ "RQ_TO_RQ", "GROUP_TO_GROUP"};
+
+ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
+ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
+
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -762,8 +778,202 @@ void sched_avg_update(struct rq *rq)
}
}
+/*
+ * Note C-state for (idle) cpus.
+ *
+ * @cstate = cstate index, 0 -> active state
+ * @wakeup_energy = energy spent in waking up cpu
+ * @wakeup_latency = latency to wakeup from cstate
+ *
+ */
+void
+sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->cstate = cstate; /* C1, C2 etc */
+ rq->wakeup_energy = wakeup_energy;
+ rq->wakeup_latency = wakeup_latency;
+}
+
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
+
+static bool use_cycle_counter;
+static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
+
+u64 sched_ktime_clock(void)
+{
+ if (unlikely(sched_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void sched_resume(void)
+{
+ sched_ktime_suspended = false;
+}
+
+static int sched_suspend(void)
+{
+ ktime_last = ktime_get();
+ sched_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops sched_syscore_ops = {
+ .resume = sched_resume,
+ .suspend = sched_suspend
+};
+
+static int __init sched_init_ops(void)
+{
+ register_syscore_ops(&sched_syscore_ops);
+ return 0;
+}
+late_initcall(sched_init_ops);
+
+static inline void clear_ed_task(struct task_struct *p, struct rq *rq)
+{
+ if (p == rq->ed_task)
+ rq->ed_task = NULL;
+}
+
+static inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
+{
+ p->last_wake_ts = wallclock;
+}
+
+static inline void set_task_last_switch_out(struct task_struct *p,
+ u64 wallclock)
+{
+ p->last_switch_out_ts = wallclock;
+}
+
+/*
+ * Note D-state for (idle) cluster.
+ *
+ * @dstate = dstate index, 0 -> active state
+ * @wakeup_energy = energy spent in waking up cluster
+ * @wakeup_latency = latency to wakeup from cluster
+ *
+ */
+void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
+ int wakeup_energy, int wakeup_latency)
+{
+ struct sched_cluster *cluster =
+ cpu_rq(cpumask_first(cluster_cpus))->cluster;
+ cluster->dstate = dstate;
+ cluster->dstate_wakeup_energy = wakeup_energy;
+ cluster->dstate_wakeup_latency = wakeup_latency;
+}
+
+u32 __weak get_freq_max_load(int cpu, u32 freq)
+{
+ /* 100% by default */
+ return 100;
+}
+
+DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
+static DEFINE_SPINLOCK(freq_max_load_lock);
+
+int sched_update_freq_max_load(const cpumask_t *cpumask)
+{
+ int i, cpu, ret;
+ unsigned int freq;
+ struct cpu_pstate_pwr *costs;
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+ struct freq_max_load *max_load, *old_max_load;
+ struct freq_max_load_entry *entry;
+ u64 max_demand_capacity, max_demand;
+ unsigned long flags;
+ u32 hfreq;
+ int hpct;
+
+ if (!per_cpu_info)
+ return 0;
+
+ spin_lock_irqsave(&freq_max_load_lock, flags);
+ max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
+ for_each_cpu(cpu, cpumask) {
+ if (!per_cpu_info[cpu].ptable) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+
+ /*
+ * Allocate len + 1 entries and leave the last power cost as 0 so
+ * that power_cost() can stop iterating when per_cpu_info[cpu].len
+ * exceeds the length of max_load, due to a race between the cpu
+ * power stats update and get_cpu_pwr_stats().
+ */
+ max_load = kzalloc(sizeof(struct freq_max_load) +
+ sizeof(struct freq_max_load_entry) *
+ (per_cpu_info[cpu].len + 1), GFP_ATOMIC);
+ if (unlikely(!max_load)) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ max_load->length = per_cpu_info[cpu].len;
+
+ max_demand = max_demand_capacity *
+ cpu_max_possible_capacity(cpu);
+
+ i = 0;
+ costs = per_cpu_info[cpu].ptable;
+ while (costs[i].freq) {
+ entry = &max_load->freqs[i];
+ freq = costs[i].freq;
+ hpct = get_freq_max_load(cpu, freq);
+ if (hpct <= 0 || hpct > 100)
+ hpct = 100;
+ hfreq = div64_u64((u64)freq * hpct, 100);
+ entry->hdemand =
+ div64_u64(max_demand * hfreq,
+ cpu_max_possible_freq(cpu));
+ i++;
+ }
+
+ rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
+ if (old_max_load)
+ kfree_rcu(old_max_load, rcu);
+ }
+
+ spin_unlock_irqrestore(&freq_max_load_lock, flags);
+ return 0;
+
+fail:
+ for_each_cpu(cpu, cpumask) {
+ max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+ if (max_load) {
+ rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
+ kfree_rcu(max_load, rcu);
+ }
+ }
+
+ spin_unlock_irqrestore(&freq_max_load_lock, flags);
+ return ret;
+}
+
+#else /* CONFIG_SCHED_HMP */
+u64 sched_ktime_clock(void)
+{
+ return 0;
+}
+
+static inline void clear_ed_task(struct task_struct *p, struct rq *rq) {}
+static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) {}
+static inline void set_task_last_switch_out(struct task_struct *p,
+ u64 wallclock) {}
+#endif /* CONFIG_SCHED_HMP */
+
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
@@ -833,6 +1043,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -841,6 +1052,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
+ trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -856,6 +1068,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
+ if (flags & DEQUEUE_SLEEP)
+ clear_ed_task(p, rq);
+
dequeue_task(rq, p, flags);
}
@@ -1047,6 +1262,3004 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq, true);
}
+#ifdef CONFIG_SCHED_HMP
+unsigned int max_possible_efficiency = 1;
+unsigned int min_possible_efficiency = UINT_MAX;
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+ int i;
+ int max_cap = 0, min_cap = INT_MAX;
+
+ for_each_online_cpu(i) {
+ max_cap = max(max_cap, cpu_capacity(i));
+ min_cap = min(min_cap, cpu_capacity(i));
+ }
+
+ max_capacity = max_cap;
+ min_capacity = min_cap;
+}
+
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long
+capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return (1024 * cluster->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return (1024 * cluster_max_freq(cluster)) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static inline unsigned long
+load_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cluster->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq,
+ cluster_max_freq(cluster));
+}
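+
+/*
+ * Worked example (illustrative numbers only): with min_possible_efficiency
+ * = 1024 and max_possible_efficiency = 2048, a big cluster of efficiency
+ * 2048 gets capacity_scale_cpu_efficiency() = 1024 * 2048 / 1024 = 2048,
+ * while a little cluster of efficiency 1024 gets
+ * load_scale_cpu_efficiency() = DIV_ROUND_UP(1024 * 2048, 1024) = 2048,
+ * i.e. task load is doubled when evaluated against the little cluster.
+ */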
+
+static int compute_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cluster);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_max_possible_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_load_scale_factor(struct sched_cluster *cluster)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cluster);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cluster);
+ load_scale >>= 10;
+
+ return load_scale;
+}
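+
+/*
+ * Worked example (hypothetical platform): little cluster with efficiency
+ * 1024 and max_freq 1.5GHz, big cluster with efficiency 2048 and max_freq
+ * 2GHz, so max_possible_efficiency = 2048 and max_possible_freq = 2GHz.
+ * For the little cluster:
+ *
+ *	load_scale_cpu_efficiency() = DIV_ROUND_UP(1024 * 2048, 1024) = 2048
+ *	load_scale_cpu_freq() = DIV_ROUND_UP(1024 * 2000000, 1500000) = 1366
+ *	load_scale_factor = (((1024 * 2048) >> 10) * 1366) >> 10 = 2732
+ *
+ * i.e. a task's load is scaled up roughly 2.7x when judging its fit on
+ * the little cluster.
+ */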
+
+struct list_head cluster_head;
+static DEFINE_MUTEX(cluster_lock);
+static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
+DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
+struct sched_cluster *sched_cluster[NR_CPUS];
+int num_clusters;
+
+static struct sched_cluster init_cluster = {
+ .list = LIST_HEAD_INIT(init_cluster.list),
+ .id = 0,
+ .max_power_cost = 1,
+ .min_power_cost = 1,
+ .capacity = 1024,
+ .max_possible_capacity = 1024,
+ .efficiency = 1,
+ .load_scale_factor = 1024,
+ .cur_freq = 1,
+ .max_freq = 1,
+ .max_mitigated_freq = UINT_MAX,
+ .min_freq = 1,
+ .max_possible_freq = 1,
+ .dstate = 0,
+ .dstate_wakeup_energy = 0,
+ .dstate_wakeup_latency = 0,
+ .exec_scale_factor = 1024,
+};
+
+void update_all_clusters_stats(void)
+{
+ struct sched_cluster *cluster;
+ u64 highest_mpc = 0, lowest_mpc = U64_MAX;
+
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_sched_cluster(cluster) {
+ u64 mpc;
+
+ cluster->capacity = compute_capacity(cluster);
+ mpc = cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+ cluster->exec_scale_factor =
+ DIV_ROUND_UP(cluster->efficiency * 1024,
+ max_possible_efficiency);
+
+ if (mpc > highest_mpc)
+ highest_mpc = mpc;
+
+ if (mpc < lowest_mpc)
+ lowest_mpc = mpc;
+ }
+
+ max_possible_capacity = highest_mpc;
+ min_max_possible_capacity = lowest_mpc;
+
+ __update_min_max_capacity();
+ sched_update_freq_max_load(cpu_possible_mask);
+ post_big_task_count_change(cpu_possible_mask);
+}
+
+static void assign_cluster_ids(struct list_head *head)
+{
+ struct sched_cluster *cluster;
+ int pos = 0;
+
+ list_for_each_entry(cluster, head, list) {
+ cluster->id = pos;
+ sched_cluster[pos++] = cluster;
+ }
+}
+
+static void
+move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
+{
+ struct list_head *first, *last;
+
+ first = src->next;
+ last = src->prev;
+
+ if (sync_rcu) {
+ INIT_LIST_HEAD_RCU(src);
+ synchronize_rcu();
+ }
+
+ first->prev = dst;
+ dst->prev = last;
+ last->next = dst;
+
+ /* Ensure list sanity before making the head visible to all CPUs. */
+ smp_mb();
+ dst->next = first;
+}
+
+static int
+compare_clusters(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct sched_cluster *cluster1, *cluster2;
+ int ret;
+
+ cluster1 = container_of(a, struct sched_cluster, list);
+ cluster2 = container_of(b, struct sched_cluster, list);
+
+ ret = cluster1->max_power_cost > cluster2->max_power_cost ||
+ (cluster1->max_power_cost == cluster2->max_power_cost &&
+ cluster1->max_possible_capacity <
+ cluster2->max_possible_capacity);
+
+ return ret;
+}
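+
+/*
+ * With the above comparator, list_sort() orders clusters by ascending
+ * max_power_cost; among clusters of equal max_power_cost, the one with
+ * the higher max_possible_capacity sorts first. E.g. (cost, capacity)
+ * pairs (20, 1024), (10, 2048), (10, 1024) sort to:
+ * (10, 2048), (10, 1024), (20, 1024).
+ */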
+
+static void sort_clusters(void)
+{
+ struct sched_cluster *cluster;
+ struct list_head new_head;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_sched_cluster(cluster) {
+ cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
+ max_task_load());
+ cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
+ 0);
+ }
+
+ move_list(&new_head, &cluster_head, true);
+
+ list_sort(NULL, &new_head, compare_clusters);
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+}
+
+static void
+insert_cluster(struct sched_cluster *cluster, struct list_head *head)
+{
+ struct sched_cluster *tmp;
+ struct list_head *iter = head;
+
+ list_for_each_entry(tmp, head, list) {
+ if (cluster->max_power_cost < tmp->max_power_cost)
+ break;
+ iter = &tmp->list;
+ }
+
+ list_add(&cluster->list, iter);
+}
+
+static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
+{
+ struct sched_cluster *cluster = NULL;
+
+ cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
+ if (!cluster) {
+		__WARN_printf("Cluster allocation failed. "
+			      "Possible bad scheduling\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&cluster->list);
+ cluster->max_power_cost = 1;
+ cluster->min_power_cost = 1;
+ cluster->capacity = 1024;
+ cluster->max_possible_capacity = 1024;
+ cluster->efficiency = 1;
+ cluster->load_scale_factor = 1024;
+ cluster->cur_freq = 1;
+ cluster->max_freq = 1;
+ cluster->max_mitigated_freq = UINT_MAX;
+ cluster->min_freq = 1;
+ cluster->max_possible_freq = 1;
+ cluster->dstate = 0;
+ cluster->dstate_wakeup_energy = 0;
+ cluster->dstate_wakeup_latency = 0;
+ cluster->freq_init_done = false;
+
+ cluster->cpus = *cpus;
+ cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
+
+ if (cluster->efficiency > max_possible_efficiency)
+ max_possible_efficiency = cluster->efficiency;
+ if (cluster->efficiency < min_possible_efficiency)
+ min_possible_efficiency = cluster->efficiency;
+
+ return cluster;
+}
+
+static void add_cluster(const struct cpumask *cpus, struct list_head *head)
+{
+ struct sched_cluster *cluster = alloc_new_cluster(cpus);
+ int i;
+
+ if (!cluster)
+ return;
+
+ for_each_cpu(i, cpus)
+ cpu_rq(i)->cluster = cluster;
+
+ insert_cluster(cluster, head);
+ set_bit(num_clusters, all_cluster_ids);
+ num_clusters++;
+}
+
+#ifdef CONFIG_SMP
+static void update_cluster_topology(void)
+{
+ struct cpumask cpus = *cpu_possible_mask;
+ const struct cpumask *cluster_cpus;
+ struct list_head new_head;
+ int i;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_cpu(i, &cpus) {
+ cluster_cpus = cpu_coregroup_mask(i);
+ cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
+ cpumask_andnot(&cpus, &cpus, cluster_cpus);
+ add_cluster(cluster_cpus, &new_head);
+ }
+
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+}
+#endif
+
+static void init_clusters(void)
+{
+ bitmap_clear(all_cluster_ids, 0, NR_CPUS);
+ init_cluster.cpus = *cpu_possible_mask;
+ INIT_LIST_HEAD(&cluster_head);
+}
+
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+ mutex_lock(&cluster_lock);
+ if (!cb->get_cpu_cycle_counter) {
+ mutex_unlock(&cluster_lock);
+ return -EINVAL;
+ }
+
+ cpu_cycle_counter_cb = *cb;
+ use_cycle_counter = true;
+ mutex_unlock(&cluster_lock);
+
+ return 0;
+}
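+
+/*
+ * Illustrative (hypothetical) usage from a cycle-counter driver; the
+ * names below are made up for the example:
+ *
+ *	static u64 my_read_cycle_counter(int cpu)
+ *	{
+ *		return per_cpu(my_cycle_count, cpu);
+ *	}
+ *
+ *	static struct cpu_cycle_counter_cb my_cb = {
+ *		.get_cpu_cycle_counter = my_read_cycle_counter,
+ *	};
+ *
+ *	register_cpu_cycle_counter_cb(&my_cb);
+ */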
+
+static int __init set_sched_enable_hmp(char *str)
+{
+ int enable_hmp = 0;
+
+ get_option(&str, &enable_hmp);
+
+ sched_enable_hmp = !!enable_hmp;
+
+ return 0;
+}
+
+early_param("sched_enable_hmp", set_sched_enable_hmp);
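+
+/*
+ * e.g. passing "sched_enable_hmp=1" on the kernel command line enables
+ * HMP scheduling; any non-zero value enables it, 0 disables it.
+ */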
+
+static inline int got_boost_kick(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ return test_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+static inline void clear_boost_kick(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ clear_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+void boost_kick(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
+ smp_send_reschedule(cpu);
+}
+
+/* Clear any HMP scheduler related requests pending from or on cpu */
+static inline void clear_hmp_request(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ clear_boost_kick(cpu);
+ clear_reserved(cpu);
+ if (rq->push_task) {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->push_task) {
+ clear_reserved(rq->push_cpu);
+ put_task_struct(rq->push_task);
+ rq->push_task = NULL;
+ }
+ rq->active_balance = 0;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->static_cpu_pwr_cost = cost;
+ return 0;
+}
+
+unsigned int sched_get_static_cpu_pwr_cost(int cpu)
+{
+ return cpu_rq(cpu)->static_cpu_pwr_cost;
+}
+
+int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+ cluster->static_cluster_pwr_cost = cost;
+ return 0;
+}
+
+unsigned int sched_get_static_cluster_pwr_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int got_boost_kick(void)
+{
+ return 0;
+}
+
+static inline void clear_boost_kick(int cpu) { }
+
+static inline void clear_hmp_request(int cpu) { }
+
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+static void update_cluster_topology(void) { }
+#endif
+
+#endif /* CONFIG_SCHED_HMP */
+
+#define SCHED_MIN_FREQ 1
+
+#if defined(CONFIG_SCHED_HMP)
+
+/*
+ * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
+ * associated with them. This is required for atomic update of those variables
+ * when being modified via the sysctl interface.
+ *
+ * IMPORTANT: Initialize both copies to the same value!!
+ */
+
+/*
+ * Tasks that are runnable continuously for a period greater than
+ * EARLY_DETECTION_DURATION can be flagged early as potential
+ * high load tasks.
+ */
+#define EARLY_DETECTION_DURATION 9500000
+
+static __read_mostly unsigned int sched_ravg_hist_size = 5;
+__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
+
+static __read_mostly unsigned int sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+__read_mostly unsigned int sysctl_sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+
+#define SCHED_ACCOUNT_WAIT_TIME 1
+
+__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
+
+unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+__read_mostly unsigned int sysctl_sched_new_task_windows = 5;
+
+#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
+
+/*
+ * For increase, send notification if
+ * freq_required - cur_freq > sysctl_sched_freq_inc_notify
+ */
+__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
+
+/*
+ * For decrease, send notification if
+ * cur_freq - freq_required > sysctl_sched_freq_dec_notify
+ */
+__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
+
+static __read_mostly unsigned int sched_io_is_busy;
+
+__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly sched_use_pelt;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+unsigned int min_max_freq = 1;
+
+unsigned int max_capacity = 1024; /* max(rq->capacity) */
+unsigned int min_capacity = 1024; /* min(rq->capacity) */
+unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
+
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = 10000000;
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+/* Temporarily disable window-stats activity on all cpus */
+unsigned int __read_mostly sched_disable_window_stats;
+
+/*
+ * Major task runtime. If a task runs for more than sched_major_task_runtime
+ * in a window, it's considered to be generating majority of workload
+ * for this window. Prediction could be adjusted for such tasks.
+ */
+#ifdef CONFIG_SCHED_FREQ_INPUT
+__read_mostly unsigned int sched_major_task_runtime = 10000000;
+#endif
+
+static unsigned int sync_cpu;
+
+static LIST_HEAD(related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &related_thread_groups, list)
+
+/*
+ * Demand aggregation for frequency purpose:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purpose. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Let's say the stats for window M were as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
+ * from all group A tasks is added to CPU1. IOW, at the end of window M, cpu busy
+ * time reported to governor will be:
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+static __read_mostly unsigned int sched_freq_aggregate;
+__read_mostly unsigned int sysctl_sched_freq_aggregate;
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static inline int exiting_task(struct task_struct *p)
+{
+ return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
+}
+
+static int __init set_sched_ravg_window(char *str)
+{
+ get_option(&str, &sched_ravg_window);
+
+ sched_use_pelt = (sched_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ sched_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+ return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
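+
+/*
+ * e.g. "sched_ravg_window=20000000" on the kernel command line selects a
+ * 20ms window. A value outside [MIN_SCHED_RAVG_WINDOW,
+ * MAX_SCHED_RAVG_WINDOW] (10ms..1s) switches load tracking back to PELT
+ * (sched_use_pelt = 1).
+ */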
+
+static inline void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ BUG_ON(delta < 0);
+ if (delta < sched_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
+}
+
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ u32 freq;
+
+ freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
+ delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
+ delta *= rq->cluster->exec_scale_factor;
+ delta >>= 10;
+
+ return delta;
+}
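+
+/*
+ * Worked example (illustrative numbers): delta = 1ms of execution with an
+ * estimated current frequency of 1GHz, max_possible_freq = 2GHz and
+ * cluster exec_scale_factor = 512 yields 1ms * 1/2 = 0.5ms, then
+ * 0.5ms * 512 / 1024 = 0.25ms of recorded demand, i.e. execution time is
+ * normalized to the fastest cpu running at its maximum frequency.
+ */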
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+static inline int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!sched_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+/* Does freq_required sufficiently exceed or fall behind cur_freq? */
+static inline int
+nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
+{
+ int delta = freq_required - cur_freq;
+
+ if (freq_required > cur_freq)
+ return delta < sysctl_sched_freq_inc_notify;
+
+ delta = -delta;
+
+ return delta < sysctl_sched_freq_dec_notify;
+}
+
+/* Convert busy time to frequency equivalent */
+static inline unsigned int load_to_freq(struct rq *rq, u64 load)
+{
+ unsigned int freq;
+
+ load = scale_load_to_cpu(load, cpu_of(rq));
+ load *= 128;
+ load = div64_u64(load, max_task_load());
+
+ freq = load * cpu_max_possible_freq(cpu_of(rq));
+ freq /= 128;
+
+ return freq;
+}
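+
+/*
+ * Example: a load of half max_task_load() on a cpu whose
+ * max_possible_freq is 2GHz translates to a 1GHz frequency request
+ * (128 * 1/2 = 64; 64 * 2GHz / 128 = 1GHz).
+ */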
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu);
+
+/*
+ * Return load from all related group in given cpu.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+
+ for_each_related_thread_group(grp) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, cpu);
+ *grp_load += cpu_time->prev_runnable_sum;
+ if (new_grp_load)
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+}
+
+/*
+ * Return load from all related groups in given frequency domain.
+ * Caller must ensure that related_thread_group_lock is held.
+ */
+static void group_load_in_freq_domain(struct cpumask *cpus,
+ u64 *grp_load, u64 *new_grp_load)
+{
+ struct related_thread_group *grp;
+ int j;
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(j, cpus) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = _group_cpu_time(grp, j);
+ *grp_load += cpu_time->prev_runnable_sum;
+ *new_grp_load += cpu_time->nt_prev_runnable_sum;
+ }
+ }
+}
+
+/*
+ * Should scheduler alert governor for changing frequency?
+ *
+ * @check_pred - evaluate frequency based on the predictive demand
+ * @check_groups - add load from all related groups on given cpu
+ *
+ * check_groups is set to 1 if a "related" task movement/wakeup is triggering
+ * the notification check. To avoid "re-aggregation" of demand in such cases,
+ * we check whether the migrated/woken task's demand (along with demand from
+ * existing tasks on the cpu) can be met on the target cpu.
+ *
+ */
+
+static int send_notification(struct rq *rq, int check_pred, int check_groups)
+{
+ unsigned int cur_freq, freq_required;
+ unsigned long flags;
+ int rc = 0;
+ u64 group_load = 0, new_load = 0;
+
+ if (!sched_enable_hmp)
+ return 0;
+
+ if (check_pred) {
+ u64 prev = rq->old_busy_time;
+ u64 predicted = rq->hmp_stats.pred_demands_sum;
+
+ if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
+ return 0;
+
+ prev = max(prev, rq->old_estimated_time);
+ if (prev > predicted)
+ return 0;
+
+ cur_freq = load_to_freq(rq, prev);
+ freq_required = load_to_freq(rq, predicted);
+
+ if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
+ return 0;
+ } else {
+ read_lock(&related_thread_group_lock);
+ /*
+ * Protect from concurrent update of rq->prev_runnable_sum and
+ * group cpu load
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (check_groups)
+ _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
+
+ new_load = rq->prev_runnable_sum + group_load;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ read_unlock(&related_thread_group_lock);
+
+ cur_freq = load_to_freq(rq, rq->old_busy_time);
+ freq_required = load_to_freq(rq, new_load);
+
+ if (nearly_same_freq(cur_freq, freq_required))
+ return 0;
+ }
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->notifier_sent) {
+ rq->notifier_sent = 1;
+ rc = 1;
+ trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
+ new_load);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+/* Alert governor if there is a need to change frequency */
+void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
+{
+ int cpu = cpu_of(rq);
+
+ if (!send_notification(rq, check_pred, check_groups))
+ return;
+
+ atomic_notifier_call_chain(
+ &load_alert_notifier_head, 0,
+ (void *)(long)cpu);
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+		/* TASK_WAKE and TASK_MIGRATE are not possible on the idle task! */
+		if (event == PICK_NEXT_TASK)
+			return 0;
+
+		/* PUT_PREV_TASK, TASK_UPDATE and IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
+ return 1;
+
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved
+	 * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
+ return SCHED_FREQ_ACCOUNT_WAIT_TIME;
+}
+
+static inline bool is_new_task(struct task_struct *p)
+{
+ return p->ravg.active_windows < sysctl_sched_new_task_windows;
+}
+
+#define INC_STEP 8
+#define DEC_STEP 2
+#define CONSISTENT_THRES 16
+#define INC_STEP_BIG 16
+/*
+ * bucket_increase - update the count of all buckets
+ *
+ * @buckets: array of buckets tracking busy time of a task
+ * @idx: the index of bucket to be incremented
+ *
+ * Each time a complete window finishes, count of bucket that runtime
+ * falls in (@idx) is incremented. Counts of all other buckets are
+ * decayed. The rate of increase and decay could be different based
+ * on current count in the bucket.
+ */
+static inline void bucket_increase(u8 *buckets, int idx)
+{
+ int i, step;
+
+ for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
+ if (idx != i) {
+ if (buckets[i] > DEC_STEP)
+ buckets[i] -= DEC_STEP;
+ else
+ buckets[i] = 0;
+ } else {
+ step = buckets[i] >= CONSISTENT_THRES ?
+ INC_STEP_BIG : INC_STEP;
+ if (buckets[i] > U8_MAX - step)
+ buckets[i] = U8_MAX;
+ else
+ buckets[i] += step;
+ }
+ }
+}
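+
+/*
+ * Example: with buckets = {0, 12, 3, 0, ...} and idx = 1, bucket 1 is
+ * below CONSISTENT_THRES and grows by INC_STEP to 20, while bucket 2
+ * decays by DEC_STEP to 1. Once a bucket reaches CONSISTENT_THRES it
+ * grows by INC_STEP_BIG instead, saturating at U8_MAX.
+ */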
+
+static inline int busy_to_bucket(u32 normalized_rt)
+{
+ int bidx;
+
+ bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
+ bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
+
+ /*
+	 * Combine the lowest two buckets. Even at the lowest frequency,
+	 * busy time lands in the 2nd bucket, so continuing to predict
+	 * the lowest bucket is not useful.
+ */
+ if (!bidx)
+ bidx++;
+
+ return bidx;
+}
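+
+/*
+ * Example (assuming NUM_BUSY_BUCKETS = 10): a normalized runtime of
+ * 35% of max_task_load() maps to bucket 3, while anything below 20%
+ * lands in the combined bucket 1.
+ */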
+
+static inline u64
+scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
+{
+ return div64_u64(load * (u64)src_freq, (u64)dst_freq);
+}
+
+#define HEAVY_TASK_SKIP 2
+#define HEAVY_TASK_SKIP_LIMIT 4
+/*
+ * get_pred_busy - calculate predicted demand for a task on runqueue
+ *
+ * @rq: runqueue of task p
+ * @p: task whose prediction is being updated
+ * @start: starting bucket. returned prediction should not be lower than
+ * this bucket.
+ * @runtime: runtime of the task. returned prediction should not be lower
+ * than this runtime.
+ * Note: @start can be derived from @runtime. It's passed in only to
+ * avoid duplicated calculation in some cases.
+ *
+ * A new predicted busy time is returned for task @p based on @runtime
+ * passed in. The function searches through buckets that represent busy
+ * time equal to or bigger than @runtime and attempts to find the bucket
+ * to use for prediction. Once found, it searches through historical busy
+ * time and returns the latest that falls into the bucket. If no such busy
+ * time exists, it returns the midpoint of that bucket.
+ */
+static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
+ int start, u32 runtime)
+{
+ int i;
+ u8 *buckets = p->ravg.busy_buckets;
+ u32 *hist = p->ravg.sum_history;
+ u32 dmin, dmax;
+ u64 cur_freq_runtime = 0;
+ int first = NUM_BUSY_BUCKETS, final, skip_to;
+ u32 ret = runtime;
+
+ /* skip prediction for new tasks due to lack of history */
+ if (unlikely(is_new_task(p)))
+ goto out;
+
+ /* find minimal bucket index to pick */
+ for (i = start; i < NUM_BUSY_BUCKETS; i++) {
+ if (buckets[i]) {
+ first = i;
+ break;
+ }
+ }
+ /* if no higher buckets are filled, predict runtime */
+ if (first >= NUM_BUSY_BUCKETS)
+ goto out;
+
+ /* compute the bucket for prediction */
+ final = first;
+ if (first < HEAVY_TASK_SKIP_LIMIT) {
+ /* compute runtime at current CPU frequency */
+ cur_freq_runtime = mult_frac(runtime, max_possible_efficiency,
+ rq->cluster->efficiency);
+ cur_freq_runtime = scale_load_to_freq(cur_freq_runtime,
+ max_possible_freq, rq->cluster->cur_freq);
+ /*
+ * if the task runs for majority of the window, try to
+ * pick higher buckets.
+ */
+ if (cur_freq_runtime >= sched_major_task_runtime) {
+ int next = NUM_BUSY_BUCKETS;
+ /*
+ * if there is a higher bucket that's consistently
+ * hit, don't jump beyond that.
+ */
+ for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT &&
+ i < NUM_BUSY_BUCKETS; i++) {
+ if (buckets[i] > CONSISTENT_THRES) {
+ next = i;
+ break;
+ }
+ }
+ skip_to = min(next, start + HEAVY_TASK_SKIP);
+ /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */
+ skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to);
+ /* don't go below first non-empty bucket, if any */
+ final = max(first, skip_to);
+ }
+ }
+
+ /* determine demand range for the predicted bucket */
+ if (final < 2) {
+ /* lowest two buckets are combined */
+ dmin = 0;
+ final = 1;
+ } else {
+ dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
+ }
+ dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
+
+ /*
+ * search through runtime history and return first runtime that falls
+ * into the range of predicted bucket.
+ */
+ for (i = 0; i < sched_ravg_hist_size; i++) {
+ if (hist[i] >= dmin && hist[i] < dmax) {
+ ret = hist[i];
+ break;
+ }
+ }
+ /* no historical runtime within bucket found, use average of the bin */
+ if (ret < dmin)
+ ret = (dmin + dmax) / 2;
+ /*
+ * when updating in middle of a window, runtime could be higher
+ * than all recorded history. Always predict at least runtime.
+ */
+ ret = max(runtime, ret);
+out:
+ trace_sched_update_pred_demand(rq, p, runtime,
+ mult_frac((unsigned int)cur_freq_runtime, 100,
+ sched_ravg_window), ret);
+ return ret;
+}
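+
+/*
+ * Example (assuming NUM_BUSY_BUCKETS = 10): a prediction that lands in
+ * bucket 4 searches the runtime history for a sample in [40%, 50%) of
+ * max_task_load(); if none is found it returns the bucket midpoint,
+ * 45% of max_task_load(), and never less than @runtime itself.
+ */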
+
+static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
+{
+ if (p->ravg.pred_demand >= p->ravg.curr_window)
+ return p->ravg.pred_demand;
+
+ return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
+ p->ravg.curr_window);
+}
+
+/*
+ * The predictive demand of a task is calculated at window roll-over.
+ * If the task's current window busy time exceeds the predicted
+ * demand, update it here to reflect the task's needs.
+ */
+void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
+{
+ u32 new, old;
+
+ if (is_idle_task(p) || exiting_task(p))
+ return;
+
+ if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
+ (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
+ (event != TASK_MIGRATE &&
+ event != PICK_NEXT_TASK)))
+ return;
+
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved
+	 * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
+ return;
+ }
+
+ new = calc_pred_demand(rq, p);
+ old = p->ravg.pred_demand;
+
+ if (old >= new)
+ return;
+
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ p->sched_class->fixup_hmp_sched_stats(rq, p,
+ p->ravg.demand,
+ new);
+
+ p->ravg.pred_demand = new;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, full_window = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = sched_ravg_window;
+ u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ int flip_counters = 0;
+ int prev_sum_reset = 0;
+ bool new_task;
+ struct related_thread_group *grp;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ full_window = (window_start - mark_start) >= window_size;
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ new_task = is_new_task(p);
+
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ /* cpu_time protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu_of(rq));
+
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (cpu_time->window_start != rq->window_start) {
+ int nr_windows;
+
+ delta = rq->window_start - cpu_time->window_start;
+ nr_windows = div64_u64(delta, window_size);
+ if (nr_windows > 1)
+ prev_sum_reset = 1;
+
+ cpu_time->window_start = rq->window_start;
+ flip_counters = 1;
+ }
+
+ if (p_is_curr_task && new_window) {
+ u64 curr_sum = rq->curr_runnable_sum;
+ u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+
+ if (full_window)
+ curr_sum = nt_curr_sum = 0;
+
+ rq->prev_runnable_sum = curr_sum;
+ rq->nt_prev_runnable_sum = nt_curr_sum;
+
+ rq->curr_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = 0;
+ }
+ } else {
+ if (p_is_curr_task && new_window) {
+ flip_counters = 1;
+ if (full_window)
+ prev_sum_reset = 1;
+ }
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!full_window)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (flip_counters) {
+ u64 curr_sum = *curr_runnable_sum;
+ u64 nt_curr_sum = *nt_curr_runnable_sum;
+
+ if (prev_sum_reset)
+ curr_sum = nt_curr_sum = 0;
+
+ *prev_runnable_sum = curr_sum;
+ *nt_prev_runnable_sum = nt_curr_sum;
+
+ *curr_runnable_sum = 0;
+ *nt_curr_runnable_sum = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ /* p is idle task */
+ BUG_ON(p != rq->idle);
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!full_window) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!full_window) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+
+ /* Rollover is done here by overwriting the values in
+ * prev_runnable_sum and curr_runnable_sum. */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ if (mark_start > window_start) {
+ *curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ *prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+
+static inline u32 predict_and_update_buckets(struct rq *rq,
+ struct task_struct *p, u32 runtime) {
+
+ int bidx;
+ u32 pred_demand;
+
+ bidx = busy_to_bucket(runtime);
+ pred_demand = get_pred_busy(rq, p, bidx, runtime);
+ bucket_increase(p->ravg.busy_buckets, bidx);
+
+ return pred_demand;
+}
+#define assign_ravg_pred_demand(x) (p->ravg.pred_demand = x)
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+static inline void
+update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
+{
+}
+
+static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+}
+
+static inline u32 predict_and_update_buckets(struct rq *rq,
+ struct task_struct *p, u32 runtime)
+{
+ return 0;
+}
+#define assign_ravg_pred_demand(x)
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
+static void update_task_cpu_cycles(struct task_struct *p, int cpu)
+{
+ if (use_cycle_counter)
+ p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+}
+
+static void
+update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ u64 cur_cycles;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!use_cycle_counter) {
+ rq->cc.cycles = cpu_cur_freq(cpu);
+ rq->cc.time = 1;
+ return;
+ }
+
+ cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+ /*
+ * If current task is idle task and irqtime == 0 CPU was
+ * indeed idle and probably its cycle counter was not
+	 * increasing. We still need an estimated CPU frequency
+ * for IO wait time accounting. Use the previously
+ * calculated frequency in such a case.
+ */
+ if (!is_idle_task(rq->curr) || irqtime) {
+ if (unlikely(cur_cycles < p->cpu_cycles))
+ rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
+ else
+ rq->cc.cycles = cur_cycles - p->cpu_cycles;
+ rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
+
+ if (event == IRQ_UPDATE && is_idle_task(p))
+ /*
+ * Time between mark_start of idle task and IRQ handler
+ * entry time is CPU cycle counter stall period.
+ * Upon IRQ handler entry sched_account_irqstart()
+ * replenishes idle task's cpu cycle counter so
+ * rq->cc.cycles now represents increased cycles during
+ * IRQ handler rather than time between idle entry and
+ * IRQ exit. Thus use irqtime as time delta.
+ */
+ rq->cc.time = irqtime;
+ else
+ rq->cc.time = wallclock - p->ravg.mark_start;
+ BUG_ON((s64)rq->cc.time < 0);
+ }
+
+ p->cpu_cycles = cur_cycles;
+
+ trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand, pred_demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = sched_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, sched_ravg_hist_size);
+ if (sched_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+ pred_demand = predict_and_update_buckets(rq, p, runtime);
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ */
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
+ pred_demand);
+
+ p->ravg.demand = demand;
+ assign_ravg_pred_demand(pred_demand);
+
+done:
+ trace_sched_update_history(rq, p, runtime, samples, event);
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > sched_ravg_window))
+ p->ravg.sum = sched_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = sched_ravg_window;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
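+
+/*
+ * Worked example of case (c): window_size = 10ms, window boundaries at
+ * 10/20/30ms, mark_start = 3ms, wallclock = 37ms. The 7ms up to the
+ * first boundary is added to p->ravg.sum, which is then recorded as one
+ * history sample; the two full windows are recorded as two samples of a
+ * full (scaled) window_size; finally p->ravg.sum restarts with the
+ * trailing 7ms (30ms..37ms).
+ */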
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+static void
+update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ if (sched_use_pelt || !rq->window_start || sched_disable_window_stats)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start) {
+ update_task_cpu_cycles(p, cpu_of(rq));
+ goto done;
+ }
+
+ update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+ update_task_pred_demand(rq, p, event);
+done:
+ trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+ rq->cc.cycles, rq->cc.time,
+ _group_cpu_time(p->grp, cpu_of(rq)));
+
+ p->ravg.mark_start = wallclock;
+}
+
+void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+		if (nr_windows < 10) {
+			/* Decay CPU's irqload by a factor of 3/4. */
+			rq->avg_irqload = div64_u64(rq->avg_irqload * 3, 4);
+		} else {
+			rq->avg_irqload = 0;
+		}
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
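+
+/*
+ * Example of the decay above: an avg_irqload of 4ms with no new irq
+ * activity decays to 3ms at the next accounting point; once 10 or more
+ * jiffies elapse between updates it is zeroed outright.
+ */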
+
+void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!rq->window_start || sched_disable_window_stats)
+ return;
+
+ if (is_idle_task(curr)) {
+ /* We're here without rq->lock held, IRQ disabled */
+ raw_spin_lock(&rq->lock);
+ update_task_cpu_cycles(curr, cpu);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+static inline void mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start || sched_disable_window_stats) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = sched_ktime_clock();
+ p->ravg.mark_start = p->last_wake_ts = wallclock;
+ p->last_cpu_selected_ts = wallclock;
+ p->last_switch_out_ts = 0;
+ update_task_cpu_cycles(p, cpu_of(rq));
+}
+
+static inline void set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (rq->window_start || !sched_enable_hmp)
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = sched_ktime_clock();
+ } else {
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+#endif
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+static inline void migrate_sync_cpu(int cpu)
+{
+ if (cpu == sync_cpu)
+ sync_cpu = smp_processor_id();
+}
+
+static void reset_all_task_stats(void)
+{
+ struct task_struct *g, *p;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ reset_task_stats(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.sum_history
+ *
+ * Stop accounting (exiting) task's future cpu usage
+ *
+ * We need this so that reset_all_window_stats() can function correctly.
+ * reset_all_window_stats() depends on do_each_thread/for_each_thread task
+ * iterators to reset *all* tasks' statistics. Exiting tasks however become
+ * invisible to those iterators. sched_exit() is called on an exiting task
+ * prior to being removed from the task list, which lets
+ * reset_all_window_stats() function correctly.
+ */
+void sched_exit(struct task_struct *p)
+{
+ unsigned long flags;
+ int cpu = get_cpu();
+ struct rq *rq = cpu_rq(cpu);
+ u64 wallclock;
+
+ sched_set_group_id(p, 0);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ /* rq->curr == p */
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ dequeue_task(rq, p, 0);
+ reset_task_stats(p);
+ p->ravg.mark_start = wallclock;
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+ enqueue_task(rq, p, 0);
+ clear_ed_task(p, rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ put_cpu();
+}
+
+static void disable_window_stats(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ sched_disable_window_stats = 1;
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+
+ local_irq_restore(flags);
+}
+
+/* Called with all cpu's rq->lock held */
+static void enable_window_stats(void)
+{
+ sched_disable_window_stats = 0;
+}
+
+enum reset_reason_code {
+ WINDOW_CHANGE,
+ POLICY_CHANGE,
+ HIST_SIZE_CHANGE,
+ FREQ_AGGREGATE_CHANGE,
+};
+
+const char *sched_window_reset_reasons[] = {
+ "WINDOW_CHANGE",
+ "POLICY_CHANGE",
+	"HIST_SIZE_CHANGE",
+	"FREQ_AGGREGATE_CHANGE",
+};
+
+/* Called with IRQs enabled */
+void reset_all_window_stats(u64 window_start, unsigned int window_size)
+{
+ int cpu;
+ unsigned long flags;
+ u64 start_ts = sched_ktime_clock();
+ int reason = WINDOW_CHANGE;
+ unsigned int old = 0, new = 0;
+ struct related_thread_group *grp;
+
+ disable_window_stats();
+
+ reset_all_task_stats();
+
+ local_irq_save(flags);
+
+ read_lock(&related_thread_group_lock);
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ raw_spin_lock(&rq->lock);
+ }
+
+ list_for_each_entry(grp, &related_thread_groups, list) {
+ int j;
+
+ for_each_possible_cpu(j) {
+ struct group_cpu_time *cpu_time;
+ /* Protected by rq lock */
+ cpu_time = _group_cpu_time(grp, j);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ if (window_start)
+ cpu_time->window_start = window_start;
+ }
+ }
+
+ if (window_size) {
+ sched_ravg_window = window_size * TICK_NSEC;
+ set_hmp_defaults();
+ }
+
+ enable_window_stats();
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (window_start)
+ rq->window_start = window_start;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+#endif
+ reset_cpu_hmp_stats(cpu, 1);
+ }
+
+ if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
+ reason = POLICY_CHANGE;
+ old = sched_window_stats_policy;
+ new = sysctl_sched_window_stats_policy;
+ sched_window_stats_policy = sysctl_sched_window_stats_policy;
+ } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
+ reason = HIST_SIZE_CHANGE;
+ old = sched_ravg_hist_size;
+ new = sysctl_sched_ravg_hist_size;
+ sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
+ }
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
+ }
+#endif
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ read_unlock(&related_thread_group_lock);
+
+ local_irq_restore(flags);
+
+ trace_sched_reset_all_window_stats(window_start, window_size,
+ sched_ktime_clock() - start_ts, reason, old, new);
+}
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
+
+void sched_get_cpus_busy(struct sched_load *busy,
+ const struct cpumask *query_cpus)
+{
+ unsigned long flags;
+ struct rq *rq;
+ const int cpus = cpumask_weight(query_cpus);
+ u64 load[cpus], group_load[cpus];
+ u64 nload[cpus], ngload[cpus];
+ u64 pload[cpus];
+ unsigned int cur_freq[cpus], max_freq[cpus];
+ int notifier_sent[cpus];
+ int early_detection[cpus];
+ int cpu, i = 0;
+ unsigned int window_size;
+ u64 max_prev_sum = 0;
+ int max_busy_cpu = cpumask_first(query_cpus);
+ struct related_thread_group *grp;
+
+ if (unlikely(cpus == 0))
+ return;
+
+ /*
+ * This function could be called in timer context, and the
+ * current task may have been executing for a long time. Ensure
+ * that the window stats are current by doing an update.
+ */
+ read_lock(&related_thread_group_lock);
+
+ local_irq_save(flags);
+ for_each_cpu(cpu, query_cpus)
+ raw_spin_lock(&cpu_rq(cpu)->lock);
+
+ window_size = sched_ravg_window;
+
+ for_each_cpu(cpu, query_cpus) {
+ rq = cpu_rq(cpu);
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
+ 0);
+ cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
+
+ load[i] = rq->old_busy_time = rq->prev_runnable_sum;
+ nload[i] = rq->nt_prev_runnable_sum;
+ pload[i] = rq->hmp_stats.pred_demands_sum;
+ rq->old_estimated_time = pload[i];
+
+ if (load[i] > max_prev_sum) {
+ max_prev_sum = load[i];
+ max_busy_cpu = cpu;
+ }
+
+ notifier_sent[i] = rq->notifier_sent;
+ early_detection[i] = (rq->ed_task != NULL);
+ rq->notifier_sent = 0;
+		max_freq[i] = cpu_max_freq(cpu);
+ i++;
+ }
+
+ for_each_related_thread_group(grp) {
+ for_each_cpu(cpu, query_cpus) {
+ /* Protected by rq_lock */
+ struct group_cpu_time *cpu_time =
+ _group_cpu_time(grp, cpu);
+ sync_window_start(cpu_rq(cpu), cpu_time);
+ }
+ }
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ group_load[i] = 0;
+ ngload[i] = 0;
+
+ if (early_detection[i])
+ goto skip_early;
+
+ rq = cpu_rq(cpu);
+ if (!notifier_sent[i]) {
+ if (cpu == max_busy_cpu)
+ group_load_in_freq_domain(
+ &rq->freq_domain_cpumask,
+ &group_load[i], &ngload[i]);
+ } else {
+ _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
+ }
+
+ load[i] += group_load[i];
+ nload[i] += ngload[i];
+ /*
+ * Scale load in reference to cluster max_possible_freq.
+ *
+ * Note that scale_load_to_cpu() scales load in reference to
+ * the cluster max_freq.
+ */
+ load[i] = scale_load_to_cpu(load[i], cpu);
+ nload[i] = scale_load_to_cpu(nload[i], cpu);
+ pload[i] = scale_load_to_cpu(pload[i], cpu);
+skip_early:
+ i++;
+ }
+
+ for_each_cpu(cpu, query_cpus)
+ raw_spin_unlock(&(cpu_rq(cpu))->lock);
+ local_irq_restore(flags);
+
+ read_unlock(&related_thread_group_lock);
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ rq = cpu_rq(cpu);
+
+ if (early_detection[i]) {
+ busy[i].prev_load = div64_u64(sched_ravg_window,
+ NSEC_PER_USEC);
+ busy[i].new_task_load = 0;
+ goto exit_early;
+ }
+
+ if (!notifier_sent[i]) {
+ load[i] = scale_load_to_freq(load[i], max_freq[i],
+ cur_freq[i]);
+ nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+ cur_freq[i]);
+ if (load[i] > window_size)
+ load[i] = window_size;
+ if (nload[i] > window_size)
+ nload[i] = window_size;
+
+ load[i] = scale_load_to_freq(load[i], cur_freq[i],
+ cpu_max_possible_freq(cpu));
+ nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
+ cpu_max_possible_freq(cpu));
+ } else {
+ load[i] = scale_load_to_freq(load[i], max_freq[i],
+ cpu_max_possible_freq(cpu));
+ nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+ cpu_max_possible_freq(cpu));
+ }
+ pload[i] = scale_load_to_freq(pload[i], max_freq[i],
+ rq->cluster->max_possible_freq);
+
+ busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
+ busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
+ busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
+
+exit_early:
+ trace_sched_get_busy(cpu, busy[i].prev_load,
+ busy[i].new_task_load,
+ busy[i].predicted_load,
+ early_detection[i]);
+ i++;
+ }
+}
+
+void sched_set_io_is_busy(int val)
+{
+ sched_io_is_busy = val;
+}
+
+int sched_set_window(u64 window_start, unsigned int window_size)
+{
+ u64 now, cur_jiffies, jiffy_ktime_ns;
+ s64 ws;
+ unsigned long flags;
+
+ if (sched_use_pelt ||
+ (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW))
+ return -EINVAL;
+
+ mutex_lock(&policy_mutex);
+
+ /*
+ * Get a consistent view of ktime, jiffies, and the time
+ * since the last jiffy (based on last_jiffies_update).
+ */
+ local_irq_save(flags);
+ cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
+ local_irq_restore(flags);
+
+ /* translate window_start from jiffies to nanoseconds */
+ ws = (window_start - cur_jiffies); /* jiffy difference */
+ ws *= TICK_NSEC;
+ ws += jiffy_ktime_ns;
+
+	/*
+	 * Roll back the calculated window start so that it is in the
+	 * past (window stats must have a current window).
+	 */
+ while (ws > now)
+ ws -= (window_size * TICK_NSEC);
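+	/*
+	 * Illustrative arithmetic (assuming HZ=100, i.e. TICK_NSEC=10ms):
+	 * a window_start 5 jiffies ahead of cur_jiffies starts ws ~50ms in
+	 * the future; with a window_size of 2 ticks the loop walks ws back
+	 * 20ms at a time until it lands at most one window in the past.
+	 */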
+
+ BUG_ON(sched_ktime_clock() < ws);
+
+ reset_all_window_stats(ws, window_size);
+
+ sched_update_freq_max_load(cpu_possible_mask);
+
+ mutex_unlock(&policy_mutex);
+
+ return 0;
+}
+
+static void fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ struct migration_sum_data d;
+ bool new_task;
+ struct related_thread_group *grp;
+
+ if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING))
+ return;
+
+ if (exiting_task(p)) {
+ clear_ed_task(p, src_rq);
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ if (sched_disable_window_stats)
+ goto done;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE,
+ wallclock, 0);
+ update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ update_task_ravg(p, task_rq(p), TASK_MIGRATE,
+ wallclock, 0);
+
+ update_task_cpu_cycles(p, new_cpu);
+
+ new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ migrate_type = GROUP_TO_GROUP;
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ /* Protected by rq_lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ sync_window_start(dest_rq, cpu_time);
+ } else {
+ migrate_type = RQ_TO_RQ;
+ d.src_rq = src_rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = dest_rq;
+ d.dst_cpu_time = NULL;
+ src_curr_runnable_sum = &src_rq->curr_runnable_sum;
+ src_prev_runnable_sum = &src_rq->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
+
+ dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
+ dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
+ }
+
+ if (p->ravg.curr_window) {
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ }
+ }
+
+ if (p->ravg.prev_window) {
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+ }
+
+ if (p == src_rq->ed_task) {
+ src_rq->ed_task = NULL;
+ if (!dest_rq->ed_task)
+ dest_rq->ed_task = p;
+ }
+
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
+
+done:
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+#else
+
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
+#define sched_up_down_migrate_auto_update 1
+static void check_for_up_down_migrate_update(const struct cpumask *cpus)
+{
+ int i = cpumask_first(cpus);
+
+ if (!sched_up_down_migrate_auto_update)
+ return;
+
+ if (cpu_max_possible_capacity(i) == max_possible_capacity)
+ return;
+
+ if (cpu_max_possible_freq(i) == cpu_max_freq(i))
+ up_down_migrate_scale_factor = 1024;
+ else
+ up_down_migrate_scale_factor = (1024 *
+ cpu_max_possible_freq(i)) / cpu_max_freq(i);
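+	/*
+	 * Example (illustrative values): with a max_possible_freq of
+	 * 2.0GHz mitigated down to a 1.5GHz max_freq, the factor becomes
+	 * (1024 * 2000000) / 1500000 = 1365, i.e. ~1.33x in fixed point.
+	 */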
+
+ update_up_down_migrate();
+}
+
+/* Return cluster which can offer required capacity for group */
+static struct sched_cluster *
+best_cluster(struct related_thread_group *grp, u64 total_demand)
+{
+ struct sched_cluster *cluster = NULL;
+
+ for_each_sched_cluster(cluster) {
+ if (group_will_fit(cluster, grp, total_demand))
+ return cluster;
+ }
+
+ return NULL;
+}
+
+static void _set_preferred_cluster(struct related_thread_group *grp)
+{
+ struct task_struct *p;
+ u64 combined_demand = 0;
+
+ if (!sysctl_sched_enable_colocation) {
+ grp->last_update = sched_ktime_clock();
+ grp->preferred_cluster = NULL;
+ return;
+ }
+
+ /*
+	 * Wakeups of two or more related tasks can race with each other,
+	 * resulting in multiple calls to _set_preferred_cluster() at the
+	 * same time. Avoid the overhead of rechecking the preferred
+	 * cluster in such cases.
+ */
+ if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
+ return;
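+	/*
+	 * The recheck above is rate-limited to one tenth of a window,
+	 * e.g. ~1ms for a 10ms sched_ravg_window.
+	 */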
+
+ list_for_each_entry(p, &grp->tasks, grp_list)
+ combined_demand += p->ravg.demand;
+
+ grp->preferred_cluster = best_cluster(grp, combined_demand);
+ grp->last_update = sched_ktime_clock();
+ trace_sched_set_preferred_cluster(grp, combined_demand);
+}
+
+static void set_preferred_cluster(struct related_thread_group *grp)
+{
+ raw_spin_lock(&grp->lock);
+ _set_preferred_cluster(grp);
+ raw_spin_unlock(&grp->lock);
+}
+
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+static void
+update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime);
+
+static inline void free_group_cputime(struct related_thread_group *grp)
+{
+ free_percpu(grp->cpu_time);
+}
+
+static int alloc_group_cputime(struct related_thread_group *grp)
+{
+ int i;
+ struct group_cpu_time *cpu_time;
+ int cpu = raw_smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ u64 window_start = rq->window_start;
+
+ grp->cpu_time = alloc_percpu(struct group_cpu_time);
+ if (!grp->cpu_time)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ cpu_time = per_cpu_ptr(grp->cpu_time, i);
+ memset(cpu_time, 0, sizeof(struct group_cpu_time));
+ cpu_time->window_start = window_start;
+ }
+
+ return 0;
+}
+
+/*
+ * A group's window_start may lag behind the rq's. When moving it forward,
+ * flip the prev/curr counters; when moving forward by more than one window,
+ * the prev counter is set to 0.
+ */
+static inline void
+sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
+{
+ u64 delta;
+ int nr_windows;
+ u64 curr_sum = cpu_time->curr_runnable_sum;
+ u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
+
+ delta = rq->window_start - cpu_time->window_start;
+ if (!delta)
+ return;
+
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ if (nr_windows > 1)
+ curr_sum = nt_curr_sum = 0;
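+	/*
+	 * One window ahead: the curr sums roll over into prev below. More
+	 * than one window ahead: an entire window elapsed with no group
+	 * activity, so prev restarts from zero as well.
+	 */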
+
+ cpu_time->prev_runnable_sum = curr_sum;
+ cpu_time->curr_runnable_sum = 0;
+
+ cpu_time->nt_prev_runnable_sum = nt_curr_sum;
+ cpu_time->nt_curr_runnable_sum = 0;
+
+ cpu_time->window_start = rq->window_start;
+}
+
+/*
+ * A task's CPU usage is accounted in:
+ *	rq->curr/prev_runnable_sum, when its ->grp is NULL
+ *	grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer the task's CPU usage between these counters when it transitions
+ * between groups.
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event)
+{
+ u64 wallclock;
+ struct group_cpu_time *cpu_time;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ struct migration_sum_data d;
+ int migrate_type;
+
+ if (!sched_freq_aggregate)
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+
+	/* cpu_time is protected by related_thread_group_lock, grp->lock and rq->lock */
+ cpu_time = _group_cpu_time(grp, cpu_of(rq));
+ if (event == ADD_TASK) {
+ sync_window_start(rq, cpu_time);
+ migrate_type = RQ_TO_GROUP;
+ d.src_rq = rq;
+ d.src_cpu_time = NULL;
+ d.dst_rq = NULL;
+ d.dst_cpu_time = cpu_time;
+ src_curr_runnable_sum = &rq->curr_runnable_sum;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &rq->prev_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ } else if (event == REM_TASK) {
+ migrate_type = GROUP_TO_RQ;
+ d.src_rq = NULL;
+ d.src_cpu_time = cpu_time;
+ d.dst_rq = rq;
+ d.dst_cpu_time = NULL;
+
+ /*
+		 * In the REM_TASK case, cpu_time->window_start is already
+		 * up to date because of the update_task_ravg() call above
+		 * on the moving task, so there is no need for
+		 * sync_window_start().
+ */
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_curr_runnable_sum = &rq->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ }
+
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+
+ if (is_new_task(p)) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ trace_sched_migration_update_sum(p, migrate_type, &d);
+
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+}
+
+static inline struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+ return _group_cpu_time(rcu_dereference(p->grp), cpu);
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+ return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
+}
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+static inline void free_group_cputime(struct related_thread_group *grp) { }
+
+static inline int alloc_group_cputime(struct related_thread_group *grp)
+{
+ return 0;
+}
+
+static inline void transfer_busy_time(struct rq *rq,
+ struct related_thread_group *grp, struct task_struct *p, int event)
+{
+}
+
+static struct group_cpu_time *
+task_group_cpu_time(struct task_struct *p, int cpu)
+{
+ return NULL;
+}
+
+static inline struct group_cpu_time *
+_group_cpu_time(struct related_thread_group *grp, int cpu)
+{
+ return NULL;
+}
+
+#endif
+
+struct related_thread_group *alloc_related_thread_group(int group_id)
+{
+ struct related_thread_group *grp;
+
+ grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+ if (!grp)
+ return ERR_PTR(-ENOMEM);
+
+ if (alloc_group_cputime(grp)) {
+ kfree(grp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ grp->id = group_id;
+ INIT_LIST_HEAD(&grp->tasks);
+ INIT_LIST_HEAD(&grp->list);
+ raw_spin_lock_init(&grp->lock);
+
+ return grp;
+}
+
+struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+{
+ struct related_thread_group *grp;
+
+ list_for_each_entry(grp, &related_thread_groups, list) {
+ if (grp->id == group_id)
+ return grp;
+ }
+
+ return NULL;
+}
+
+/* See comments before preferred_cluster() */
+static void free_related_thread_group(struct rcu_head *rcu)
+{
+	struct related_thread_group *grp = container_of(rcu,
+			struct related_thread_group, rcu);
+
+ free_group_cputime(grp);
+ kfree(grp);
+}
+
+static void remove_task_from_group(struct task_struct *p)
+{
+ struct related_thread_group *grp = p->grp;
+ struct rq *rq;
+ int empty_group = 1;
+
+ raw_spin_lock(&grp->lock);
+
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
+ list_del_init(&p->grp_list);
+ rcu_assign_pointer(p->grp, NULL);
+ __task_rq_unlock(rq);
+
+ if (!list_empty(&grp->tasks)) {
+ empty_group = 0;
+ _set_preferred_cluster(grp);
+ }
+
+ raw_spin_unlock(&grp->lock);
+
+ if (empty_group) {
+ list_del(&grp->list);
+ call_rcu(&grp->rcu, free_related_thread_group);
+ }
+}
+
+static int
+add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
+{
+ struct rq *rq;
+
+ raw_spin_lock(&grp->lock);
+
+ /*
+	 * Change p->grp under rq->lock to prevent races with read-side
+	 * references of p->grp in various hot paths.
+ */
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
+ list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
+ __task_rq_unlock(rq);
+
+ _set_preferred_cluster(grp);
+
+ raw_spin_unlock(&grp->lock);
+
+ return 0;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ int rc = 0, destroy = 0;
+ unsigned long flags;
+ struct related_thread_group *grp = NULL, *new = NULL;
+
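+	/*
+	 * Group allocation can sleep, so it cannot happen under pi_lock.
+	 * If the group does not exist yet, the locks are dropped, the
+	 * group is allocated, and control jumps back to "redo" to
+	 * revalidate every check.
+	 */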
+redo:
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+ if ((current != p && p->flags & PF_EXITING) ||
+ (!p->grp && !group_id) ||
+ (p->grp && p->grp->id == group_id))
+ goto done;
+
+ write_lock(&related_thread_group_lock);
+
+ if (!group_id) {
+ remove_task_from_group(p);
+ write_unlock(&related_thread_group_lock);
+ goto done;
+ }
+
+ if (p->grp && p->grp->id != group_id)
+ remove_task_from_group(p);
+
+ grp = lookup_related_thread_group(group_id);
+ if (!grp && !new) {
+ /* New group */
+ write_unlock(&related_thread_group_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ new = alloc_related_thread_group(group_id);
+ if (IS_ERR(new))
+ return -ENOMEM;
+ destroy = 1;
+ /* Rerun checks (like task exiting), since we dropped pi_lock */
+ goto redo;
+ } else if (!grp && new) {
+ /* New group - use object allocated before */
+ destroy = 0;
+ list_add(&new->list, &related_thread_groups);
+ grp = new;
+ }
+
+ BUG_ON(!grp);
+ rc = add_task_to_group(p, grp);
+ write_unlock(&related_thread_group_lock);
+done:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ if (new && destroy) {
+ free_group_cputime(new);
+ kfree(new);
+ }
+
+ return rc;
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+ unsigned int group_id;
+ struct related_thread_group *grp;
+
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ group_id = grp ? grp->id : 0;
+ rcu_read_unlock();
+
+ return group_id;
+}
+
+static void update_cpu_cluster_capacity(const cpumask_t *cpus)
+{
+ int i;
+ struct sched_cluster *cluster;
+ struct cpumask cpumask;
+
+ cpumask_copy(&cpumask, cpus);
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_cpu(i, &cpumask) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
+
+ cluster->capacity = compute_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+		/* 'cpus' can span more than one cluster */
+ check_for_up_down_migrate_update(&cluster->cpus);
+ }
+
+ __update_min_max_capacity();
+
+ post_big_task_count_change(cpu_possible_mask);
+}
+
+static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
+void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
+{
+ struct cpumask cpumask;
+ struct sched_cluster *cluster;
+ int i, update_capacity = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
+ cpumask_copy(&cpumask, cpus);
+ for_each_cpu(i, &cpumask) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
+
+ update_capacity += (cluster->max_mitigated_freq != fmax);
+ cluster->max_mitigated_freq = fmax;
+ }
+ spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
+
+ if (update_capacity)
+ update_cpu_cluster_capacity(cpus);
+}
+
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ struct sched_cluster *cluster = NULL;
+ struct cpumask policy_cluster = *policy->related_cpus;
+ unsigned int orig_max_freq = 0;
+ int i, j, update_capacity = 0;
+
+ if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+ val != CPUFREQ_CREATE_POLICY)
+ return 0;
+
+ if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+ update_min_max_capacity();
+ return 0;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ for_each_cpu(i, &policy_cluster) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&policy_cluster, &policy_cluster,
+ &cluster->cpus);
+
+ orig_max_freq = cluster->max_freq;
+ cluster->min_freq = policy->min;
+ cluster->max_freq = policy->max;
+ cluster->cur_freq = policy->cur;
+
+ if (!cluster->freq_init_done) {
+ mutex_lock(&cluster_lock);
+ for_each_cpu(j, &cluster->cpus)
+ cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
+ policy->related_cpus);
+ cluster->max_possible_freq = policy->cpuinfo.max_freq;
+ cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->freq_init_done = true;
+
+ sort_clusters();
+ update_all_clusters_stats();
+ mutex_unlock(&cluster_lock);
+ continue;
+ }
+
+ update_capacity += (orig_max_freq != cluster->max_freq);
+ }
+
+ if (update_capacity)
+ update_cpu_cluster_capacity(policy->related_cpus);
+
+ return 0;
+}
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+ unsigned int cpu = freq->cpu, new_freq = freq->new;
+ unsigned long flags;
+ struct sched_cluster *cluster;
+ struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
+ int i, j;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ BUG_ON(!new_freq);
+
+ if (cpu_cur_freq(cpu) == new_freq)
+ return 0;
+
+ for_each_cpu(i, &policy_cpus) {
+ cluster = cpu_rq(i)->cluster;
+
+ for_each_cpu(j, &cluster->cpus) {
+ struct rq *rq = cpu_rq(j);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ sched_ktime_clock(), 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ cluster->cur_freq = new_freq;
+ cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
+ }
+
+ return 0;
+}
+
+static int pwr_stats_ready_notifier(struct notifier_block *nb,
+ unsigned long cpu, void *data)
+{
+ cpumask_t mask = CPU_MASK_NONE;
+
+ cpumask_set_cpu(cpu, &mask);
+ sched_update_freq_max_load(&mask);
+
+ mutex_lock(&cluster_lock);
+ sort_clusters();
+ mutex_unlock(&cluster_lock);
+
+ return 0;
+}
+
+static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+};
+
+static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+};
+
+static struct notifier_block notifier_pwr_stats_ready = {
+ .notifier_call = pwr_stats_ready_notifier
+};
+
+int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
+{
+ return -EINVAL;
+}
+
+static int register_sched_callback(void)
+{
+ int ret;
+
+ if (!sched_enable_hmp)
+ return 0;
+
+ ret = cpufreq_register_notifier(&notifier_policy_block,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (!ret)
+ ret = cpufreq_register_notifier(&notifier_trans_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
+
+ return 0;
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall time or later.
+ * Any registration done earlier is "forgotten" by cpufreq. See the
+ * initialization of the variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ u32 new_load = task_load(p);
+
+ if (!grp)
+ return 0;
+
+ /*
+	 * Update if the task's load has changed significantly or a complete
+	 * window has passed since we last updated the preference.
+ */
+ if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+ sched_ktime_clock() - grp->last_update > sched_ravg_window)
+ return 1;
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
+
+static void
+update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+}
+
+static inline void mark_task_starting(struct task_struct *p) {}
+
+static inline void set_window_start(struct rq *rq) {}
+
+static inline void migrate_sync_cpu(int cpu) {}
+
+#endif /* CONFIG_SCHED_HMP */
+
#ifdef CONFIG_SMP
/*
* This is how migration works:
@@ -1071,17 +4284,19 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
+ double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
+ double_unlock_balance(rq, cpu_rq(new_cpu));
raw_spin_unlock(&rq->lock);
rq = cpu_rq(new_cpu);
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
@@ -1103,6 +4318,8 @@ struct migration_arg {
*/
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
+ int src_cpu;
+
if (unlikely(!cpu_active(dest_cpu)))
return rq;
@@ -1110,11 +4327,40 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return rq;
+ src_cpu = cpu_of(rq);
rq = move_queued_task(rq, p, dest_cpu);
return rq;
}
+static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
+ struct task_struct *p)
+{
+ struct migration_notify_data mnd;
+ bool check_groups;
+
+ rcu_read_lock();
+ check_groups = rcu_access_pointer(p->grp) != NULL;
+ rcu_read_unlock();
+
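+	/*
+	 * A migration across frequency domains can change the load of both
+	 * domains, so both source and destination are evaluated for a
+	 * frequency change; within a single domain only the destination rq
+	 * is re-checked.
+	 */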
+ if (!same_freq_domain(src_cpu, dest_cpu)) {
+ if (!src_cpu_dead)
+ check_for_freq_change(cpu_rq(src_cpu), false,
+ check_groups);
+ check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
+ } else {
+ check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
+ }
+
+ if (task_notify_on_migrate(p)) {
+ mnd.src_cpu = src_cpu;
+ mnd.dest_cpu = dest_cpu;
+ mnd.load = pct_task_load(p);
+ atomic_notifier_call_chain(&migration_notifier_head, 0,
+ (void *)&mnd);
+ }
+}
+
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -1125,6 +4371,8 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ int src_cpu = cpu_of(rq);
+ bool moved = false;
/*
* The original target cpu might have gone down and we might
@@ -1145,12 +4393,18 @@ static int migration_cpu_stop(void *data)
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
- if (task_rq(p) == rq && task_on_rq_queued(p))
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
rq = __migrate_task(rq, p, arg->dest_cpu);
+ moved = true;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
+
+ if (moved)
+ notify_migration(src_cpu, arg->dest_cpu, false, p);
+
return 0;
}
@@ -1224,7 +4478,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
@@ -1235,7 +4490,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
@@ -1274,6 +4528,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+	 * A migrating fair-class task must have
+	 * p->on_rq == TASK_ON_RQ_MIGRATING, because schedstat_wait_{start,end}
+	 * rebase a migrating task's wait_start time based on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1290,13 +4553,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#endif
#endif
- trace_sched_migrate_task(p, new_cpu);
+ trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
+
+ fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1310,9 +4575,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
@@ -1498,7 +4765,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(queued)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ ktime_t to = ktime_set(0, NSEC_PER_MSEC);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
@@ -1717,6 +4984,7 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
+
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -1808,6 +5076,8 @@ void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
+ int cpu = smp_processor_id();
+
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
@@ -1815,9 +5085,18 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() &&
+ !got_boost_kick())
return;
+ if (got_boost_kick()) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr->sched_class == &fair_sched_class)
+ check_for_migration(rq, rq->curr);
+ clear_boost_kick(cpu);
+ }
+
/*
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
* traditionally all their work was done from the interrupt return
@@ -1905,6 +5184,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
raw_spin_unlock(&rq->lock);
}
+__read_mostly unsigned int sysctl_sched_wakeup_load_threshold = 110;
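+/*
+ * Since task load is reported as a percentage, the default of 110 keeps
+ * wakeup-load notifications off until the sysctl is lowered to 100 or
+ * less; migration notifications are unaffected.
+ */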
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
@@ -1924,7 +5205,19 @@ static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
- int cpu, success = 0;
+ int cpu, src_cpu, success = 0;
+ int notify = 0;
+ struct migration_notify_data mnd;
+#ifdef CONFIG_SMP
+ unsigned int old_load;
+ struct rq *rq;
+ u64 wallclock;
+ struct related_thread_group *grp = NULL;
+#endif
+ bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER);
+ bool check_group = false;
+
+ wake_flags &= ~WF_NO_NOTIFIER;
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1934,13 +5227,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ src_cpu = cpu = task_cpu(p);
+
if (!(p->state & state))
goto out;
trace_sched_waking(p);
success = 1; /* we're going to change ->state */
- cpu = task_cpu(p);
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
@@ -1982,6 +5276,22 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
+ rq = cpu_rq(task_cpu(p));
+
+ raw_spin_lock(&rq->lock);
+ old_load = task_load(p);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ raw_spin_unlock(&rq->lock);
+
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ if (update_preferred_cluster(grp, p, old_load))
+ set_preferred_cluster(grp);
+ rcu_read_unlock();
+ check_group = grp != NULL;
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -1989,18 +5299,55 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu) {
+
+ /* Refresh src_cpu as it could have changed since we last read it */
+ src_cpu = task_cpu(p);
+ if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
-#endif /* CONFIG_SMP */
+ set_task_last_wake(p, wallclock);
+#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
+
+ if (task_notify_on_migrate(p)) {
+ mnd.src_cpu = src_cpu;
+ mnd.dest_cpu = cpu;
+ mnd.load = pct_task_load(p);
+
+ /*
+		 * Call the migration notifier with mnd for foreground task
+		 * migrations, and also for wakeups whose load is above
+		 * sysctl_sched_wakeup_load_threshold. This prompts cpu-boost
+		 * to raise the CPU frequency when a heavyweight foreground
+		 * task wakes up.
+ */
+ if ((src_cpu != cpu) || (mnd.load >
+ sysctl_sched_wakeup_load_threshold))
+ notify = 1;
+ }
+
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (notify)
+ atomic_notifier_call_chain(&migration_notifier_head,
+ 0, (void *)&mnd);
+
+ if (freq_notif_allowed) {
+ if (!same_freq_domain(src_cpu, cpu)) {
+ check_for_freq_change(cpu_rq(cpu),
+ false, check_group);
+ check_for_freq_change(cpu_rq(src_cpu),
+ false, check_group);
+ } else if (success) {
+ check_for_freq_change(cpu_rq(cpu), true, false);
+ }
+ }
+
return success;
}
@@ -2016,9 +5363,13 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
+ if (rq != this_rq() || p == current) {
+ printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p,"
+ " this_rq = %p, p = %p, current = %p\n",
+ __func__, task_pid_nr(p), p->comm, rq,
+ this_rq(), p, current);
return;
+ }
lockdep_assert_held(&rq->lock);
@@ -2041,13 +5392,20 @@ static void try_to_wake_up_local(struct task_struct *p)
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ u64 wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ set_task_last_wake(p, wallclock);
+ }
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
+	/* TODO: send cpufreq notifier */
}
/**
@@ -2068,6 +5426,26 @@ int wake_up_process(struct task_struct *p)
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_process_no_notif - Wake up a specific process without notifying
+ * the governor
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process_no_notif(struct task_struct *p)
+{
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
+}
+EXPORT_SYMBOL(wake_up_process_no_notif);
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
@@ -2107,6 +5485,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -2375,6 +5754,7 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ init_new_task_load(p);
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2387,6 +5767,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p);
+ mark_task_starting(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2514,6 +5895,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+
+#ifdef CONFIG_MSM_APP_SETTINGS
+ if (use_app_setting)
+ switch_app_setting_bit(prev, next);
+#endif
}
/**
@@ -2775,7 +6161,7 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
*load = rq->load.weight;
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP)
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
@@ -2785,9 +6171,13 @@ void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
- int dest_cpu;
+ int dest_cpu, curr_cpu;
+
+ if (sched_enable_hmp)
+ return;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ curr_cpu = task_cpu(p);
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2796,7 +6186,7 @@ void sched_exec(void)
struct migration_arg arg = { p, dest_cpu };
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+ stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
return;
}
unlock:
@@ -2854,6 +6244,37 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_SCHED_HMP
+static bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+ struct task_struct *p;
+ int loop_max = 10;
+
+ if (!sched_boost() || !rq->cfs.h_nr_running)
+ return 0;
+
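+	/*
+	 * Scan at most the first 10 runnable CFS tasks; a task that woke
+	 * up EARLY_DETECTION_DURATION or more ago and is still queued is
+	 * flagged so that a load-alert notification can be raised early.
+	 */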
+ rq->ed_task = NULL;
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
+ if (!loop_max)
+ break;
+
+ if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
+ rq->ed_task = p;
+ return 1;
+ }
+
+ loop_max--;
+ }
+
+ return 0;
+}
+#else /* CONFIG_SCHED_HMP */
+static bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -2863,16 +6284,29 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ u64 wallclock;
+ bool early_notif;
+ u32 old_load;
+ struct related_thread_group *grp;
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ old_load = task_load(curr);
+ set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
calc_global_load_tick(rq);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ early_notif = early_detection_notify(rq, wallclock);
raw_spin_unlock(&rq->lock);
+ if (early_notif)
+ atomic_notifier_call_chain(&load_alert_notifier_head,
+ 0, (void *)(long)cpu);
+
perf_event_task_tick();
#ifdef CONFIG_SMP
@@ -2880,6 +6314,15 @@ void scheduler_tick(void)
trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
+
+ rcu_read_lock();
+ grp = task_related_thread_group(curr);
+ if (update_preferred_cluster(grp, curr, old_load))
+ set_preferred_cluster(grp);
+ rcu_read_unlock();
+
+ if (curr->sched_class == &fair_sched_class)
+ check_for_migration(rq, curr);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -2998,6 +6441,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
+#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3106,6 +6552,7 @@ static void __sched notrace __schedule(bool preempt)
unsigned long *switch_count;
struct rq *rq;
int cpu;
+ u64 wallclock;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3167,15 +6614,22 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
+ BUG_ON(task_cpu(next) != cpu_of(rq));
+
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
++*switch_count;
+ set_task_last_switch_out(prev, wallclock);
+
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
@@ -4081,7 +7535,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+EXPORT_SYMBOL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4958,7 +8412,7 @@ void show_state_filter(unsigned long state_filter)
touch_all_softlockup_watchdogs();
-#ifdef CONFIG_SCHED_DEBUG
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
rcu_read_unlock();
@@ -4987,10 +8441,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ __sched_fork(0, idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -5292,8 +8747,11 @@ static void migrate_tasks(struct rq *dead_rq)
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
+ raw_spin_unlock(&next->pi_lock);
raw_spin_unlock(&rq->lock);
+ notify_migration(dead_rq->cpu, dest_cpu, true, next);
rq = dead_rq;
+ raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
}
raw_spin_unlock(&next->pi_lock);
@@ -5524,6 +8982,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ set_window_start(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
account_reset_rq(rq);
break;
@@ -5544,6 +9005,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
+ migrate_sync_cpu(cpu);
+
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -5554,6 +9017,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_DEAD:
+ clear_hmp_request(cpu);
calc_load_migrate(rq);
break;
#endif
@@ -6028,6 +9492,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
+ unsigned long next_balance = rq->next_balance;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -6059,6 +9524,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL;
}
+ for (tmp = sd; tmp; ) {
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(tmp->balance_interval);
+ if (time_after(next_balance, tmp->last_balance + interval))
+ next_balance = tmp->last_balance + interval;
+
+ tmp = tmp->parent;
+ }
+ rq->next_balance = next_balance;
+
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
@@ -6950,6 +10426,9 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
+#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -7324,6 +10803,8 @@ void __init sched_init_smp(void)
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+ update_cluster_topology();
+
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -7342,6 +10823,7 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
+
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -7365,6 +10847,15 @@ void __init sched_init(void)
int i, j;
unsigned long alloc_size = 0, ptr;
+ if (sched_enable_hmp)
+ pr_info("HMP scheduling enabled.\n");
+
+ BUG_ON(num_possible_cpus() > BITS_PER_LONG);
+
+#ifdef CONFIG_SCHED_HMP
+ init_clusters();
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
@@ -7475,11 +10966,43 @@ void __init sched_init(void)
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
+ rq->push_task = NULL;
rq->cpu = i;
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_HMP
+ cpumask_set_cpu(i, &rq->freq_domain_cpumask);
+ rq->hmp_stats.cumulative_runnable_avg = 0;
+ rq->window_start = 0;
+ rq->hmp_stats.nr_big_tasks = 0;
+ rq->hmp_flags = 0;
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+ rq->static_cpu_pwr_cost = 0;
+ rq->cc.cycles = SCHED_MIN_FREQ;
+ rq->cc.time = 1;
+
+ /*
+		 * All cpus are part of the same cluster by default. This
+		 * avoids having to check rq->cluster for NULL in hot paths
+		 * like select_best_cpu().
+ */
+ rq->cluster = &init_cluster;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ rq->old_busy_time = 0;
+ rq->old_estimated_time = 0;
+ rq->old_busy_time_group = 0;
+ rq->notifier_sent = 0;
+ rq->hmp_stats.pred_demands_sum = 0;
+#endif
+#endif
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+ rq->cstate = 0;
+ rq->wakeup_latency = 0;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7495,6 +11018,8 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
+ set_hmp_defaults();
+
set_load_weight(&init_task);
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7543,6 +11068,14 @@ static inline int preempt_count_equals(int preempt_offset)
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
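+/*
+ * Once the early initcalls have run, might-sleep checks are enforced even
+ * while system_state is still SYSTEM_BOOTING; before that point boot-time
+ * callers remain exempt.
+ */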
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
/*
@@ -7567,8 +11100,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ !is_idle_task(current)) || oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -7595,6 +11130,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
pr_cont("\n");
}
#endif
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
+#endif
dump_stack();
}
EXPORT_SYMBOL(___might_sleep);
@@ -8269,6 +11807,63 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
sched_move_task(task);
}
+static u64 cpu_notify_on_migrate_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return tg->notify_on_migrate;
+}
+
+static int cpu_notify_on_migrate_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 notify)
+{
+ struct task_group *tg = css_tg(css);
+
+ tg->notify_on_migrate = (notify > 0);
+
+ return 0;
+}
+
+#ifdef CONFIG_SCHED_HMP
+
+static u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return tg->upmigrate_discouraged;
+}
+
+static int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 upmigrate_discourage)
+{
+ struct task_group *tg = css_tg(css);
+ int discourage = upmigrate_discourage > 0;
+
+ if (tg->upmigrate_discouraged == discourage)
+ return 0;
+
+ /*
+	 * Revisit big-task classification for tasks of this cgroup. It
+	 * would be more efficient to walk only this cgroup's running
+	 * tasks, but there is no easy way to do that. Instead, walk all
+	 * running tasks on all cpus and revisit their big-task
+	 * classification.
+ */
+ get_online_cpus();
+ pre_big_task_count_change(cpu_online_mask);
+
+ tg->upmigrate_discouraged = discourage;
+
+ post_big_task_count_change(cpu_online_mask);
+ put_online_cpus();
+
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -8554,6 +12149,18 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
+ {
+ .name = "notify_on_migrate",
+ .read_u64 = cpu_notify_on_migrate_read_u64,
+ .write_u64 = cpu_notify_on_migrate_write_u64,
+ },
+#ifdef CONFIG_SCHED_HMP
+ {
+ .name = "upmigrate_discourage",
+ .read_u64 = cpu_upmigrate_discourage_read_u64,
+ .write_u64 = cpu_upmigrate_discourage_write_u64,
+ },
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
@@ -8600,6 +12207,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .allow_attach = subsys_cgroup_allow_attach,
.legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f74ea89e77a8..f29b132a9f8b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,6 +49,8 @@ void irqtime_account_irq(struct task_struct *curr)
unsigned long flags;
s64 delta;
int cpu;
+ u64 wallclock;
+ bool account = true;
if (!sched_clock_irqtime)
return;
@@ -56,7 +58,8 @@ void irqtime_account_irq(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ wallclock = sched_clock_cpu(cpu);
+ delta = wallclock - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
irq_time_write_begin();
@@ -70,8 +73,16 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
+ else
+ account = false;
irq_time_write_end();
+
+ if (account)
+ sched_account_irqtime(cpu, curr, delta, wallclock);
+ else if (curr != this_cpu_ksoftirqd())
+ sched_account_irqstart(cpu, curr, wallclock);
+
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e285f9..44178fea87d0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -271,9 +271,11 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
/*
* By now the task is replenished and enqueued; migrate it.
*/
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
activate_task(later_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
if (!fallback)
resched_curr(later_rq);
@@ -851,6 +853,41 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
@@ -860,6 +897,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -874,6 +912,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -1555,9 +1594,11 @@ retry:
goto retry;
}
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
resched_curr(later_rq);
@@ -1643,9 +1684,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
dmin = p->dl.deadline;
/* Is there any other task even earlier? */
@@ -1846,6 +1889,11 @@ const struct sched_class dl_sched_class = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_dl,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_dl,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 641511771ae6..d1c0ef4bf07d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -227,6 +227,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
cfs_rq->throttle_count);
+ SEQ_printf(m, " .%-30s: %d\n", "runtime_enabled",
+ cfs_rq->runtime_enabled);
+#ifdef CONFIG_SCHED_HMP
+ SEQ_printf(m, " .%-30s: %d\n", "nr_big_tasks",
+ cfs_rq->hmp_stats.nr_big_tasks);
+ SEQ_printf(m, " .%-30s: %llu\n", "cumulative_runnable_avg",
+ cfs_rq->hmp_stats.cumulative_runnable_avg);
+#endif
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -306,6 +314,25 @@ do { \
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
+#ifdef CONFIG_SMP
+ P(cpu_capacity);
+#endif
+#ifdef CONFIG_SCHED_HMP
+ P(static_cpu_pwr_cost);
+ P(cluster->static_cluster_pwr_cost);
+ P(cluster->load_scale_factor);
+ P(cluster->capacity);
+ P(cluster->max_possible_capacity);
+ P(cluster->efficiency);
+ P(cluster->cur_freq);
+ P(cluster->max_freq);
+ P(cluster->exec_scale_factor);
+#endif
+#ifdef CONFIG_SCHED_HMP
+ P(hmp_stats.nr_big_tasks);
+ SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg",
+ rq->hmp_stats.cumulative_runnable_avg);
+#endif
#undef P
#undef PN
@@ -386,6 +413,16 @@ static void sched_debug_header(struct seq_file *m)
PN(sysctl_sched_wakeup_granularity);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
+#ifdef CONFIG_SCHED_HMP
+ P(sched_upmigrate);
+ P(sched_downmigrate);
+ P(sched_init_task_load_windows);
+ P(sched_init_task_load_pelt);
+ P(min_capacity);
+ P(max_capacity);
+ P(sched_use_pelt);
+ P(sched_ravg_window);
+#endif
#undef PN
#undef P
@@ -408,6 +445,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
void sysrq_sched_debug_show(void)
{
int cpu;
@@ -417,6 +455,7 @@ void sysrq_sched_debug_show(void)
print_cpu(NULL, cpu);
}
+#endif
/*
* This itererator needs some explanation.
@@ -547,6 +586,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
+ unsigned int load_avg;
+
+ load_avg = pct_task_load(p);
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
@@ -598,6 +640,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ __P(load_avg);
+#ifdef CONFIG_SCHED_HMP
+ P(ravg.demand);
+ P(se.avg.runnable_avg_sum_scaled);
+#endif
+#endif
+
{
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfdc0e61066c..958d79e1933c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -31,9 +31,8 @@
#include <linux/migrate.h>
#include <linux/task_work.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -81,6 +80,14 @@ static unsigned int sched_nr_latency = 8;
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
+ * Controls whether, when SD_SHARE_PKG_RESOURCES is set, all tasks go to
+ * idle CPUs when woken. Even when this is off, the per-task flag
+ * PF_WAKE_UP_IDLE can still cause a task to go to an idle CPU upon being
+ * woken.
+ */
+unsigned int __read_mostly sysctl_sched_wake_to_idle;
+
+/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
@@ -236,6 +243,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#ifdef CONFIG_SMP
+static int active_load_balance_cpu_stop(void *data);
+#endif
const struct sched_class fair_sched_class;
@@ -738,12 +748,56 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
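+	/*
+	 * For a task in the middle of a migration, wait_start holds the
+	 * wait time accumulated on the previous rq (stashed there by
+	 * update_stats_wait_end()), so subtract it to rebase the stamp
+	 * onto this rq's clock.
+	 */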
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
}
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
@@ -757,23 +811,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2442,7 +2479,25 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
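+/*
+ * Query/set helpers for the per-task PF_WAKE_UP_IDLE hint. The flag is
+ * consumed on the wakeup path; see wake_to_idle() and the
+ * select_idle_sibling() change below.
+ */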
+u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+ u32 enabled = p->flags & PF_WAKE_UP_IDLE;
+
+ return !!enabled;
+}
+
+int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
+{
+ int enable = !!wake_up_idle;
+
+ if (enable)
+ p->flags |= PF_WAKE_UP_IDLE;
+ else
+ p->flags &= ~PF_WAKE_UP_IDLE;
+
+ return 0;
+}
+
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
@@ -2522,6 +2577,1709 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
+static void add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta);
+static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods);
+
+struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
+{
+ return NULL;
+}
+
+enum sched_boost_type {
+ SCHED_BOOST_NONE,
+ SCHED_BOOST_ON_BIG,
+ SCHED_BOOST_ON_ALL,
+};
+
+#ifdef CONFIG_SCHED_HMP
+
+/* Initial task load. Newly created tasks are assigned this load. */
+unsigned int __read_mostly sched_init_task_load_pelt;
+unsigned int __read_mostly sched_init_task_load_windows;
+unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
+
+unsigned int max_task_load(void)
+{
+ if (sched_use_pelt)
+ return LOAD_AVG_MAX;
+
+ return sched_ravg_window;
+}
+
+/* Use this knob to turn on or off HMP-aware task placement logic */
+unsigned int __read_mostly sched_enable_hmp = 0;
+
+/* A cpu can no longer accommodate more tasks if:
+ *
+ * rq->nr_running > sysctl_sched_spill_nr_run ||
+ * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
+ */
+unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
+
+/*
+ * Place sync wakee tasks that have less than the configured demand on
+ * the waker's cluster.
+ */
+unsigned int __read_mostly sched_small_wakee_task_load;
+unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
+
+unsigned int __read_mostly sched_big_waker_task_load;
+unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
+
+/*
+ * CPUs with load greater than the sched_spill_load_threshold are not
+ * eligible for task placement. When all CPUs in a cluster achieve a
+ * load higher than this level, tasks become eligible for inter
+ * cluster migration.
+ */
+unsigned int __read_mostly sched_spill_load;
+unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
+
+/*
+ * Tasks whose bandwidth consumption on a cpu is more than
+ * sched_upmigrate are considered "big" tasks. Big tasks will be
+ * considered for "up" migration, i.e migrating to a cpu with better
+ * capacity.
+ */
+unsigned int __read_mostly sched_upmigrate;
+unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
+
+/*
+ * Big tasks, once migrated, will need to drop their bandwidth
+ * consumption to less than sched_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_downmigrate;
+unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
+
+#define SCHED_UPMIGRATE_MIN_NICE 15
+
+/*
+ * The load scale factor of a CPU gets boosted when its max frequency
+ * is restricted, which makes tasks migrate to higher capacity CPUs
+ * early. To compensate, the sched_upmigrate threshold is scaled up by
+ * rq->max_possible_freq/rq->max_freq of the lower capacity CPU.
+ */
+unsigned int up_down_migrate_scale_factor = 1024;
+
+/*
+ * Scheduler boost is a mechanism to temporarily place tasks on CPUs
+ * with higher capacity than those where they would normally have
+ * ended up given their load characteristics. Any entity enabling
+ * boost is responsible for disabling it as well.
+ */
+unsigned int sysctl_sched_boost;
+
+/*
+ * The scheduler places a waking task back on its previous CPU if its
+ * sleep time was less than sysctl_sched_select_prev_cpu_us.
+ */
+static unsigned int __read_mostly
+sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
+unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
+
+static unsigned int __read_mostly
+sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
+
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
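+/*
+ * Worked example (illustrative values, assuming pct_to_real() scales a
+ * percentage to the sched_ravg_window): with a 20ms window,
+ * upmigrate_pct = 80 and downmigrate_pct = 60 give up_migrate = 16ms
+ * and down_migrate = 12ms (delta = 4ms). A scale factor of 2048 then
+ * doubles both, but up_migrate is clamped to the 20ms window and
+ * down_migrate to up_migrate - delta = 16ms, preserving the hysteresis
+ * gap between the two thresholds.
+ */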
+void update_up_down_migrate(void)
+{
+ unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
+ unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
+ unsigned int delta;
+
+ if (up_down_migrate_scale_factor == 1024)
+ goto done;
+
+ delta = up_migrate - down_migrate;
+
+ up_migrate /= NSEC_PER_USEC;
+ up_migrate *= up_down_migrate_scale_factor;
+ up_migrate >>= 10;
+ up_migrate *= NSEC_PER_USEC;
+
+ up_migrate = min(up_migrate, sched_ravg_window);
+
+ down_migrate /= NSEC_PER_USEC;
+ down_migrate *= up_down_migrate_scale_factor;
+ down_migrate >>= 10;
+ down_migrate *= NSEC_PER_USEC;
+
+ down_migrate = min(down_migrate, up_migrate - delta);
+done:
+ sched_upmigrate = up_migrate;
+ sched_downmigrate = down_migrate;
+}
+
+void set_hmp_defaults(void)
+{
+ sched_spill_load =
+ pct_to_real(sysctl_sched_spill_load_pct);
+
+ update_up_down_migrate();
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ sched_major_task_runtime =
+ mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100);
+#endif
+
+ sched_init_task_load_pelt =
+ div64_u64((u64)sysctl_sched_init_task_load_pct *
+ (u64)LOAD_AVG_MAX, 100);
+
+ sched_init_task_load_windows =
+ div64_u64((u64)sysctl_sched_init_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
+ NSEC_PER_USEC;
+
+ sched_small_wakee_task_load =
+ div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_big_waker_task_load =
+ div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
+ (u64)sched_ravg_window, 100);
+}
+
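+/*
+ * E.g. with the default sysctl_sched_init_task_load_pct of 15 and an
+ * illustrative 20ms sched_ravg_window, a newly created task starts with
+ * a window-based demand of 0.15 * 20ms = 3ms, and a PELT-based initial
+ * load of 15% of LOAD_AVG_MAX (see set_hmp_defaults() above).
+ */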
+u32 sched_get_init_task_load(struct task_struct *p)
+{
+ return p->init_load_pct;
+}
+
+int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
+{
+ if (init_load_pct < 0 || init_load_pct > 100)
+ return -EINVAL;
+
+ p->init_load_pct = init_load_pct;
+
+ return 0;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+static inline int upmigrate_discouraged(struct task_struct *p)
+{
+ return task_group(p)->upmigrate_discouraged;
+}
+
+#else
+
+static inline int upmigrate_discouraged(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif
+
+/* Is a task "big" on its current cpu */
+static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
+{
+ int nice = task_nice(p);
+
+ if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p))
+ return 0;
+
+ return scaled_load > sched_upmigrate;
+}
+
+static inline int is_big_task(struct task_struct *p)
+{
+ return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
+}
+
+static inline u64 cpu_load(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu);
+}
+
+static inline u64 cpu_load_sync(int cpu, int sync)
+{
+ return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
+}
+
+static int boost_refcount;
+static DEFINE_SPINLOCK(boost_lock);
+static DEFINE_MUTEX(boost_mutex);
+
+static void boost_kick_cpus(void)
+{
+ int i;
+
+ for_each_online_cpu(i) {
+ if (cpu_capacity(i) != max_capacity)
+ boost_kick(i);
+ }
+}
+
+int sched_boost(void)
+{
+ return boost_refcount > 0;
+}
+
+int sched_set_boost(int enable)
+{
+ unsigned long flags;
+ int ret = 0;
+ int old_refcount;
+
+ if (!sched_enable_hmp)
+ return -EINVAL;
+
+ spin_lock_irqsave(&boost_lock, flags);
+
+ old_refcount = boost_refcount;
+
+ if (enable == 1) {
+ boost_refcount++;
+ } else if (!enable) {
+ if (boost_refcount >= 1)
+ boost_refcount--;
+ else
+ ret = -EINVAL;
+ } else {
+ ret = -EINVAL;
+ }
+
+ if (!old_refcount && boost_refcount)
+ boost_kick_cpus();
+
+ trace_sched_set_boost(boost_refcount);
+ spin_unlock_irqrestore(&boost_lock, flags);
+
+ return ret;
+}
+
+int sched_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&boost_mutex);
+ if (!write)
+ sysctl_sched_boost = sched_boost();
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ goto done;
+
+ ret = (sysctl_sched_boost <= 1) ?
+ sched_set_boost(sysctl_sched_boost) : -EINVAL;
+
+done:
+ mutex_unlock(&boost_mutex);
+ return ret;
+}
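+
+/*
+ * Illustrative usage (sketch; the exact proc path depends on the sysctl
+ * table entry): "echo 1 > /proc/sys/kernel/sched_boost" takes a boost
+ * reference and "echo 0" drops it; boost stays in effect until every
+ * enabling entity has dropped its reference.
+ */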
+
+/*
+ * A task will fit on a cpu if its bandwidth consumption on that cpu
+ * is less than sched_upmigrate. A big task that was previously
+ * "up" migrated is considered to fit on a "little" cpu if its
+ * bandwidth consumption there is less than sched_downmigrate. This
+ * hysteresis helps avoid frequent migrations for tasks with load
+ * close to the upmigrate threshold.
+ */
+
+static int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
+ enum sched_boost_type boost_type)
+{
+ int upmigrate;
+
+ if (cpu_capacity(cpu) == max_capacity)
+ return 1;
+
+ if (boost_type != SCHED_BOOST_ON_BIG) {
+ if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p))
+ return 1;
+
+ upmigrate = sched_upmigrate;
+ if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
+ upmigrate = sched_downmigrate;
+
+ if (task_load < upmigrate)
+ return 1;
+ }
+
+ return 0;
+}
+
+static enum sched_boost_type sched_boost_type(void)
+{
+ if (sched_boost()) {
+ if (min_possible_efficiency != max_possible_efficiency)
+ return SCHED_BOOST_ON_BIG;
+ else
+ return SCHED_BOOST_ON_ALL;
+ }
+ return SCHED_BOOST_NONE;
+}
+
+static int task_will_fit(struct task_struct *p, int cpu)
+{
+ u64 tload = scale_load_to_cpu(task_load(p), cpu);
+
+ return task_load_will_fit(p, tload, cpu, sched_boost_type());
+}
+
+int group_will_fit(struct sched_cluster *cluster,
+ struct related_thread_group *grp, u64 demand)
+{
+ int cpu = cluster_first_cpu(cluster);
+ int prev_capacity = 0;
+ unsigned int threshold = sched_upmigrate;
+ u64 load;
+
+ if (cluster->capacity == max_capacity)
+ return 1;
+
+ if (grp->preferred_cluster)
+ prev_capacity = grp->preferred_cluster->capacity;
+
+ if (cluster->capacity < prev_capacity)
+ threshold = sched_downmigrate;
+
+ load = scale_load_to_cpu(demand, cpu);
+ if (load < threshold)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Return the cost of running a task with the given demand on CPU cpu.
+ * This function currently assumes that the task is the only one that
+ * will run on the CPU.
+ */
+unsigned int power_cost(int cpu, u64 demand)
+{
+ int first, mid, last;
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+ struct cpu_pstate_pwr *costs;
+ struct freq_max_load *max_load;
+ int total_static_pwr_cost = 0;
+ struct rq *rq = cpu_rq(cpu);
+ unsigned int pc;
+
+ if (!per_cpu_info || !per_cpu_info[cpu].ptable)
+ /*
+ * When power aware scheduling is not in use, or CPU
+ * power data is not available, just use the CPU
+ * capacity as a rough stand-in for real CPU power
+ * numbers, assuming bigger CPUs are more power
+ * hungry.
+ */
+ return cpu_max_possible_capacity(cpu);
+
+ rcu_read_lock();
+ max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+ if (!max_load) {
+ pc = cpu_max_possible_capacity(cpu);
+ goto unlock;
+ }
+
+ costs = per_cpu_info[cpu].ptable;
+
+ if (demand <= max_load->freqs[0].hdemand) {
+ pc = costs[0].power;
+ goto unlock;
+ } else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
+ pc = costs[max_load->length - 1].power;
+ goto unlock;
+ }
+
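+ /*
+ * Binary search for the lowest frequency level whose hdemand covers
+ * 'demand'. Loop invariant: freqs[first].hdemand < demand <=
+ * freqs[last].hdemand, so costs[last].power is the answer once the
+ * window narrows to two adjacent entries.
+ */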
+ first = 0;
+ last = max_load->length - 1;
+ mid = (last - first) >> 1;
+ while (1) {
+ if (demand <= max_load->freqs[mid].hdemand)
+ last = mid;
+ else
+ first = mid;
+
+ if (last - first == 1)
+ break;
+ mid = first + ((last - first) >> 1);
+ }
+
+ pc = costs[last].power;
+
+unlock:
+ rcu_read_unlock();
+
+ if (idle_cpu(cpu) && rq->cstate) {
+ total_static_pwr_cost += rq->static_cpu_pwr_cost;
+ if (rq->cluster->dstate)
+ total_static_pwr_cost +=
+ rq->cluster->static_cluster_pwr_cost;
+ }
+
+ return pc + total_static_pwr_cost;
+}
+
+struct cpu_select_env {
+ struct task_struct *p;
+ struct related_thread_group *rtg;
+ u8 reason;
+ u8 need_idle:1;
+ u8 need_waker_cluster:1;
+ u8 sync:1;
+ u8 ignore_prev_cpu:1;
+ enum sched_boost_type boost_type;
+ int prev_cpu;
+ DECLARE_BITMAP(candidate_list, NR_CPUS);
+ DECLARE_BITMAP(backup_list, NR_CPUS);
+ u64 task_load;
+ u64 cpu_load;
+};
+
+struct cluster_cpu_stats {
+ int best_idle_cpu, least_loaded_cpu;
+ int best_capacity_cpu, best_cpu, best_sibling_cpu;
+ int min_cost, best_sibling_cpu_cost;
+ int best_cpu_cstate;
+ u64 min_load, best_load, best_sibling_cpu_load;
+ s64 highest_spare_capacity;
+};
+
+#define UP_MIGRATION 1
+#define DOWN_MIGRATION 2
+#define IRQLOAD_MIGRATION 3
+
+/*
+ * Invoked from three places:
+ * 1) try_to_wake_up() -> ... -> select_best_cpu()
+ * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
+ * 3) can_migrate_task()
+ *
+ * It's safe to dereference p->grp in the first case (since p->pi_lock is
+ * held) but not in the others. p->grp is hence freed after an RCU grace
+ * period and accessed under rcu_read_lock().
+ */
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ struct related_thread_group *grp;
+ int rc = 0;
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+ if (!grp || !sysctl_sched_enable_colocation)
+ rc = 1;
+ else
+ rc = (grp->preferred_cluster == cluster);
+
+ rcu_read_unlock();
+ return rc;
+}
+
+static inline struct sched_cluster *rq_cluster(struct rq *rq)
+{
+ return rq->cluster;
+}
+
+static int
+spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+ u64 total_load;
+
+ total_load = env->task_load + env->cpu_load;
+
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
+
+ return 0;
+}
+
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+ int tcpu = task_cpu(env->p);
+ int skip = 0;
+
+ if (!env->reason)
+ return 0;
+
+ if (is_reserved(cpu))
+ return 1;
+
+ switch (env->reason) {
+ case UP_MIGRATION:
+ skip = !idle_cpu(cpu);
+ break;
+ case IRQLOAD_MIGRATION:
+ /* Purposely fall through */
+ default:
+ skip = (cpu == tcpu);
+ break;
+ }
+
+ return skip;
+}
+
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ int tcpu;
+
+ if (!env->reason)
+ return 1;
+
+ tcpu = task_cpu(env->p);
+ switch (env->reason) {
+ case UP_MIGRATION:
+ return cluster->capacity > cpu_capacity(tcpu);
+
+ case DOWN_MIGRATION:
+ return cluster->capacity < cpu_capacity(tcpu);
+
+ default:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ if (!test_bit(cluster->id, env->candidate_list))
+ return 1;
+
+ if (!acceptable_capacity(cluster, env)) {
+ __clear_bit(cluster->id, env->candidate_list);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+ struct sched_cluster *cluster;
+
+ if (env->rtg) {
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(env->rtg->preferred_cluster));
+ return env->rtg->preferred_cluster;
+ }
+
+ for_each_sched_cluster(cluster) {
+ if (!skip_cluster(cluster, env)) {
+ int cpu = cluster_first_cpu(cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cpu);
+ if (task_load_will_fit(env->p, env->task_load, cpu,
+ env->boost_type))
+ return cluster;
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ }
+ }
+
+ return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+ int cluster_id;
+
+ cluster_id = find_next_bit(list, end, start);
+ if (cluster_id >= end)
+ return NULL;
+
+ return sched_cluster[cluster_id];
+}
+
+static void
+update_spare_capacity(struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu, int capacity,
+ u64 cpu_load)
+{
+ s64 spare_capacity = sched_ravg_window - cpu_load;
+
+ if (spare_capacity > 0 &&
+ (spare_capacity > stats->highest_spare_capacity ||
+ (spare_capacity == stats->highest_spare_capacity &&
+ ((!env->need_waker_cluster &&
+ capacity > cpu_capacity(stats->best_capacity_cpu)) ||
+ (env->need_waker_cluster &&
+ cpu_rq(cpu)->nr_running <
+ cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
+ /*
+ * If the sync waker is the only runnable task on a CPU, that
+ * CPU's cr_avg is 0, so there is a high chance of placing the
+ * wakee on the waker's CPU, which would likely preempt the
+ * waker and then migrate it once preempted. Where possible,
+ * place the wakee on a genuinely idle CPU instead by checking
+ * nr_running, to avoid such preemption.
+ */
+ stats->highest_spare_capacity = spare_capacity;
+ stats->best_capacity_cpu = cpu;
+ }
+}
+
+static inline void find_backup_cluster(
+struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+ int i;
+
+ while (!bitmap_empty(env->backup_list, num_clusters)) {
+ next = next_candidate(env->backup_list, 0, num_clusters);
+ __clear_bit(next->id, env->backup_list);
+ for_each_cpu_and(i, &env->p->cpus_allowed, &next->cpus) {
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i), power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ update_spare_capacity(stats, env, i, next->capacity,
+ cpu_load_sync(i, env->sync));
+ }
+ }
+}
+
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
+ struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+
+ __clear_bit(cluster->id, env->candidate_list);
+
+ if (env->rtg && preferred_cluster(cluster, env->p))
+ return NULL;
+
+ do {
+ if (bitmap_empty(env->candidate_list, num_clusters))
+ return NULL;
+
+ next = next_candidate(env->candidate_list, 0, num_clusters);
+ if (next) {
+ if (next->min_power_cost > stats->min_cost) {
+ clear_bit(next->id, env->candidate_list);
+ next = NULL;
+ continue;
+ }
+
+ if (skip_cluster(next, env))
+ next = NULL;
+ }
+ } while (!next);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(next));
+ return next;
+}
+
+#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int cpu_cstate;
+ int prev_cpu = env->prev_cpu;
+
+ cpu_cstate = cpu_rq(cpu)->cstate;
+
+ if (env->need_idle) {
+ stats->min_cost = cpu_cost;
+ if (idle_cpu(cpu)) {
+ if (cpu_cstate < stats->best_cpu_cstate ||
+ (cpu_cstate == stats->best_cpu_cstate &&
+ cpu == prev_cpu)) {
+ stats->best_idle_cpu = cpu;
+ stats->best_cpu_cstate = cpu_cstate;
+ }
+ } else {
+ if (env->cpu_load < stats->min_load ||
+ (env->cpu_load == stats->min_load &&
+ cpu == prev_cpu)) {
+ stats->least_loaded_cpu = cpu;
+ stats->min_load = env->cpu_load;
+ }
+ }
+
+ return;
+ }
+
+ if (cpu_cost < stats->min_cost) {
+ stats->min_cost = cpu_cost;
+ stats->best_cpu_cstate = cpu_cstate;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ return;
+ }
+
+ /* CPU cost is the same. Start breaking the tie by C-state */
+
+ if (cpu_cstate > stats->best_cpu_cstate)
+ return;
+
+ if (cpu_cstate < stats->best_cpu_cstate) {
+ stats->best_cpu_cstate = cpu_cstate;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ return;
+ }
+
+ /* C-state is the same. Use prev CPU to break the tie */
+ if (cpu == prev_cpu) {
+ stats->best_cpu = cpu;
+ return;
+ }
+
+ if (stats->best_cpu != prev_cpu &&
+ ((cpu_cstate == 0 && env->cpu_load < stats->best_load) ||
+ (cpu_cstate > 0 && env->cpu_load > stats->best_load))) {
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ }
+}
+#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int prev_cpu = env->prev_cpu;
+
+ if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+ if (stats->best_sibling_cpu_cost > cpu_cost ||
+ (stats->best_sibling_cpu_cost == cpu_cost &&
+ stats->best_sibling_cpu_load > env->cpu_load)) {
+ stats->best_sibling_cpu_cost = cpu_cost;
+ stats->best_sibling_cpu_load = env->cpu_load;
+ stats->best_sibling_cpu = cpu;
+ }
+ }
+
+ if ((cpu_cost < stats->min_cost) ||
+ ((stats->best_cpu != prev_cpu &&
+ stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
+ if (env->need_idle) {
+ if (idle_cpu(cpu)) {
+ stats->min_cost = cpu_cost;
+ stats->best_idle_cpu = cpu;
+ }
+ } else {
+ stats->min_cost = cpu_cost;
+ stats->min_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ }
+ }
+}
+#endif
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env)
+{
+ int cpu_cost;
+
+ cpu_cost = power_cost(cpu, task_load(env->p) +
+ cpu_cravg_sync(cpu, env->sync));
+ if (cpu_cost <= stats->min_cost)
+ __update_cluster_stats(cpu, stats, env, cpu_cost);
+}
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int i;
+ struct cpumask search_cpus;
+
+ cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus);
+ if (env->ignore_prev_cpu)
+ cpumask_clear_cpu(env->prev_cpu, &search_cpus);
+
+ for_each_cpu(i, &search_cpus) {
+ env->cpu_load = cpu_load_sync(i, env->sync);
+
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ if (unlikely(!cpu_active(i)) || skip_cpu(i, env))
+ continue;
+
+ update_spare_capacity(stats, env, i, c->capacity,
+ env->cpu_load);
+
+ if (env->boost_type == SCHED_BOOST_ON_ALL ||
+ env->need_waker_cluster ||
+ sched_cpu_high_irqload(i) ||
+ spill_threshold_crossed(env, cpu_rq(i)))
+ continue;
+
+ update_cluster_stats(i, stats, env);
+ }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+ stats->best_cpu = stats->best_idle_cpu = -1;
+ stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
+ stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+ stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+ stats->highest_spare_capacity = 0;
+ stats->least_loaded_cpu = -1;
+ stats->best_cpu_cstate = INT_MAX;
+ /* No need to initialize stats->best_load */
+}
+
+/*
+ * Should task be woken to any available idle cpu?
+ *
+ * Waking tasks to an idle cpu has mixed implications for both performance
+ * and power. In many cases the scheduler can't correctly estimate the
+ * impact of using idle cpus on either. PF_WAKE_UP_IDLE allows an external
+ * kernel module to pass a strong hint to the scheduler that the task in
+ * question should be woken to an idle cpu, generally to improve
+ * performance.
+ */
+static inline int wake_to_idle(struct task_struct *p)
+{
+ return (current->flags & PF_WAKE_UP_IDLE) ||
+ (p->flags & PF_WAKE_UP_IDLE);
+}
+
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int prev_cpu;
+ struct task_struct *task = env->p;
+ struct sched_cluster *cluster;
+
+ if (env->boost_type != SCHED_BOOST_NONE || env->reason ||
+ !task->ravg.mark_start ||
+ env->need_idle || !sched_short_sleep_task_threshold)
+ return false;
+
+ prev_cpu = env->prev_cpu;
+ if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) ||
+ unlikely(!cpu_active(prev_cpu)))
+ return false;
+
+ if (task->ravg.mark_start - task->last_cpu_selected_ts >=
+ sched_long_cpu_selection_threshold)
+ return false;
+
+ /*
+ * This function should be used by task wake up path only as it's
+ * assuming p->last_switch_out_ts as last sleep time.
+ * p->last_switch_out_ts can denote last preemption time as well as
+ * last sleep time.
+ */
+ if (task->ravg.mark_start - task->last_switch_out_ts >=
+ sched_short_sleep_task_threshold)
+ return false;
+
+ env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+ cluster = cpu_rq(prev_cpu)->cluster;
+
+ if (!task_load_will_fit(task, env->task_load, prev_cpu,
+ sched_boost_type())) {
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ return false;
+ }
+
+ env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+ if (sched_cpu_high_irqload(prev_cpu) ||
+ spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+ update_spare_capacity(stats, env, prev_cpu,
+ cluster->capacity, env->cpu_load);
+ env->ignore_prev_cpu = 1;
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool
+wake_to_waker_cluster(struct cpu_select_env *env)
+{
+ return !env->need_idle && !env->reason && env->sync &&
+ task_load(current) > sched_big_waker_task_load &&
+ task_load(env->p) < sched_small_wakee_task_load;
+}
+
+static inline int
+cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
+{
+ cpumask_t tmp_mask;
+
+ cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask);
+ cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed);
+
+ return !cpumask_empty(&tmp_mask);
+}
+
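+/*
+ * Selection overview (sketch of the logic below): restrict the search
+ * to the related thread group's preferred cluster when one is set;
+ * else, for small sync wakees of a big waker, confine the search to
+ * the waker's cluster; else try the fast path back to the previous
+ * CPU for short sleepers. Otherwise walk the remaining clusters in
+ * increasing power order and pick the lowest-cost CPU that fits the
+ * task, falling back to the CPU with the highest spare capacity among
+ * the backup clusters.
+ */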
+/* Return the cheapest cpu that can fit this task. */
+static int select_best_cpu(struct task_struct *p, int target, int reason,
+ int sync)
+{
+ struct sched_cluster *cluster, *pref_cluster = NULL;
+ struct cluster_cpu_stats stats;
+ bool fast_path = false;
+ struct related_thread_group *grp;
+
+ struct cpu_select_env env = {
+ .p = p,
+ .reason = reason,
+ .need_idle = wake_to_idle(p),
+ .need_waker_cluster = 0,
+ .boost_type = sched_boost_type(),
+ .sync = sync,
+ .prev_cpu = target,
+ .ignore_prev_cpu = 0,
+ .rtg = NULL,
+ };
+
+ bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+ bitmap_zero(env.backup_list, NR_CPUS);
+
+ init_cluster_cpu_stats(&stats);
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+
+ if (grp && grp->preferred_cluster) {
+ pref_cluster = grp->preferred_cluster;
+ if (!cluster_allowed(p, pref_cluster))
+ clear_bit(pref_cluster->id, env.candidate_list);
+ else
+ env.rtg = grp;
+ } else {
+ cluster = cpu_rq(smp_processor_id())->cluster;
+ if (wake_to_waker_cluster(&env) &&
+ cluster_allowed(p, cluster)) {
+ env.need_waker_cluster = 1;
+ bitmap_zero(env.candidate_list, NR_CPUS);
+ __set_bit(cluster->id, env.candidate_list);
+ } else if (bias_to_prev_cpu(&env, &stats)) {
+ fast_path = true;
+ goto out;
+ }
+ }
+
+retry:
+ cluster = select_least_power_cluster(&env);
+
+ if (!cluster)
+ goto out;
+
+ /*
+ * 'cluster' now points to the minimum power cluster which can satisfy
+ * the task's perf goals. Walk down the cluster list starting with that
+ * cluster. For non-small tasks, skip clusters that don't have
+ * mostly_idle/idle cpus.
+ */
+
+ do {
+ find_best_cpu_in_cluster(cluster, &env, &stats);
+
+ } while ((cluster = next_best_cluster(cluster, &env, &stats)));
+
+ if (env.need_idle) {
+ if (stats.best_idle_cpu >= 0)
+ target = stats.best_idle_cpu;
+ else if (stats.least_loaded_cpu >= 0)
+ target = stats.least_loaded_cpu;
+ } else if (stats.best_cpu >= 0) {
+ if (stats.best_cpu != task_cpu(p) &&
+ stats.min_cost == stats.best_sibling_cpu_cost)
+ stats.best_cpu = stats.best_sibling_cpu;
+
+ target = stats.best_cpu;
+ } else {
+ if (env.rtg) {
+ env.rtg = NULL;
+ goto retry;
+ }
+
+ find_backup_cluster(&env, &stats);
+ if (stats.best_capacity_cpu >= 0)
+ target = stats.best_capacity_cpu;
+ }
+ p->last_cpu_selected_ts = sched_ktime_clock();
+
+out:
+ rcu_read_unlock();
+ trace_sched_task_load(p, sched_boost(), env.reason, env.sync,
+ env.need_idle, fast_path, target);
+ return target;
+}
+
+static void
+inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ if (is_big_task(p))
+ stats->nr_big_tasks++;
+}
+
+static void
+dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ if (is_big_task(p))
+ stats->nr_big_tasks--;
+
+ BUG_ON(stats->nr_big_tasks < 0);
+}
+
+static void
+inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
+{
+ stats->nr_big_tasks = 0;
+ if (reset_cra) {
+ stats->cumulative_runnable_avg = 0;
+ set_pred_demands_sum(stats, 0);
+ }
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+
+ return (&tg->list == &task_groups) ? NULL : tg;
+}
+
+/* Iterate over all cfs_rq in a cpu */
+#define for_each_cfs_rq(cfs_rq, tg, cpu) \
+ for (tg = container_of(&task_groups, struct task_group, list); \
+ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+static void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+
+ for_each_cfs_rq(cfs_rq, tg, cpu)
+ reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Return the total number of tasks "eligible" to run on the highest
+ * capacity cpu.
+ *
+ * This is simply nr_big_tasks for cpus which are not of max_capacity,
+ * and nr_running for cpus of max_capacity.
+ */
+unsigned int nr_eligible_big_tasks(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ int nr_big = rq->hmp_stats.nr_big_tasks;
+ int nr = rq->nr_running;
+
+ if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
+ return nr_big;
+
+ return nr;
+}
+
+/*
+ * reset_cpu_hmp_stats - reset HMP stats for a cpu
+ * nr_big_tasks
+ * cumulative_runnable_avg (iff reset_cra is true)
+ */
+void reset_cpu_hmp_stats(int cpu, int reset_cra)
+{
+ reset_cfs_rq_hmp_stats(cpu, reset_cra);
+ reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
+}
+
+static void
+fixup_nr_big_tasks(struct hmp_sched_stats *stats, struct task_struct *p,
+ s64 delta)
+{
+ u64 new_task_load;
+ u64 old_task_load;
+
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));
+ new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
+
+ if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))
+ stats->nr_big_tasks--;
+ else if (!__is_big_task(p, old_task_load) &&
+ __is_big_task(p, new_task_load))
+ stats->nr_big_tasks++;
+
+ BUG_ON(stats->nr_big_tasks < 0);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+
+/* Add task's contribution to a cpu' HMP statistics */
+static void
+_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+ * Although the check below is not strictly required (as
+ * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called
+ * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit of
+ * efficiency by short-circuiting the for_each_sched_entity() loop when
+ * !sched_enable_hmp || sched_disable_window_stats.
+ */
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Remove task's contribution from a cpu' HMP statistics */
+static void
+_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /* See comment on efficiency in _inc_hmp_sched_stats_fair */
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ dec_rq_hmp_stats(rq, p, change_cra);
+}
+
+static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _inc_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _dec_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se) {
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+ }
+}
+
+static int task_will_be_throttled(struct task_struct *p);
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+static void
+_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
+ */
+static void update_nr_big_tasks(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+
+ /* Do not reset cumulative_runnable_avg */
+ reset_cpu_hmp_stats(cpu, 0);
+
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+ _inc_hmp_sched_stats_fair(rq, p, 0);
+}
+
+/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
+void pre_big_task_count_change(const struct cpumask *cpus)
+{
+ int i;
+
+ local_irq_disable();
+
+ for_each_cpu(i, cpus)
+ raw_spin_lock(&cpu_rq(i)->lock);
+}
+
+/*
+ * Reinitialize 'nr_big_tasks' counters on all affected cpus
+ */
+void post_big_task_count_change(const struct cpumask *cpus)
+{
+ int i;
+
+ /* Assumes local_irq_disable() keeps online cpumap stable */
+ for_each_cpu(i, cpus)
+ update_nr_big_tasks(i);
+
+ for_each_cpu(i, cpus)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+
+ local_irq_enable();
+}
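+
+/*
+ * Typical pairing (see sched_hmp_proc_update_handler() below):
+ *
+ *   get_online_cpus();
+ *   pre_big_task_count_change(cpu_online_mask);
+ *   set_hmp_defaults();
+ *   post_big_task_count_change(cpu_online_mask);
+ *   put_online_cpus();
+ */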
+
+DEFINE_MUTEX(policy_mutex);
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+static inline int invalid_value_freq_input(unsigned int *data)
+{
+ if (data == &sysctl_sched_freq_aggregate)
+ return !(*data == 0 || *data == 1);
+
+ return 0;
+}
+#else
+static inline int invalid_value_freq_input(unsigned int *data)
+{
+ return 0;
+}
+#endif
+
+static inline int invalid_value(unsigned int *data)
+{
+ unsigned int val = *data;
+
+ if (data == &sysctl_sched_ravg_hist_size)
+ return (val < 2 || val > RAVG_HIST_SIZE_MAX);
+
+ if (data == &sysctl_sched_window_stats_policy)
+ return val >= WINDOW_STATS_INVALID_POLICY;
+
+ return invalid_value_freq_input(data);
+}
+
+/*
+ * Handle "atomic" update of sysctl_sched_window_stats_policy,
+ * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables.
+ */
+int sched_window_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int *data = (unsigned int *)table->data;
+ unsigned int old_val;
+
+ if (!sched_enable_hmp)
+ return -EINVAL;
+
+ mutex_lock(&policy_mutex);
+
+ old_val = *data;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret || !write || (write && (old_val == *data)))
+ goto done;
+
+ if (invalid_value(data)) {
+ *data = old_val;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ reset_all_window_stats(0, 0);
+
+done:
+ mutex_unlock(&policy_mutex);
+
+ return ret;
+}
+
+/*
+ * Convert percentage tunables into absolute form. This avoids a div()
+ * operation in the fast path when comparing task load against them.
+ */
+int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int old_val;
+ unsigned int *data = (unsigned int *)table->data;
+ int update_min_nice = 0;
+
+ mutex_lock(&policy_mutex);
+
+ old_val = *data;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write || !sched_enable_hmp)
+ goto done;
+
+ if (write && (old_val == *data))
+ goto done;
+
+ if (data != &sysctl_sched_select_prev_cpu_us) {
+ /*
+ * All tunables other than sched_select_prev_cpu_us are
+ * expressed in percent.
+ */
+ if (sysctl_sched_downmigrate_pct >
+ sysctl_sched_upmigrate_pct || *data > 100) {
+ *data = old_val;
+ ret = -EINVAL;
+ goto done;
+ }
+ }
+
+ /*
+ * A change to a big-task tunable requires re-classifying the tasks on
+ * each runqueue as big and setting their counters appropriately.
+ * The sysctl interface affects secondary variables (*_pct), which are
+ * then "atomically" carried over to the primary variables. The atomic
+ * change includes taking the runqueue lock of all online cpus and
+ * re-initializing their big-task counters based on the changed criteria.
+ */
+ if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
+ get_online_cpus();
+ pre_big_task_count_change(cpu_online_mask);
+ }
+
+ set_hmp_defaults();
+
+ if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
+ post_big_task_count_change(cpu_online_mask);
+ put_online_cpus();
+ }
+
+done:
+ mutex_unlock(&policy_mutex);
+ return ret;
+}
+
+/*
+ * Reset balance_interval at all sched_domain levels of given cpu, so that it
+ * honors kick.
+ */
+static inline void reset_balance_interval(int cpu)
+{
+ struct sched_domain *sd;
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ sd->balance_interval = 0;
+ rcu_read_unlock();
+}
+
+/*
+ * Check if a task is on the "wrong" cpu (i.e. its current cpu is not
+ * the ideal cpu as per its demand or priority).
+ *
+ * Returns the reason why the task needs to be migrated.
+ */
+static inline int migration_needed(struct task_struct *p, int cpu)
+{
+ int nice;
+ struct related_thread_group *grp;
+
+ if (!sched_enable_hmp || p->state != TASK_RUNNING)
+ return 0;
+
+ /* No need to migrate task that is about to be throttled */
+ if (task_will_be_throttled(p))
+ return 0;
+
+ if (sched_boost_type() == SCHED_BOOST_ON_BIG) {
+ if (cpu_capacity(cpu) != max_capacity)
+ return UP_MIGRATION;
+ return 0;
+ }
+
+ if (sched_cpu_high_irqload(cpu))
+ return IRQLOAD_MIGRATION;
+
+ nice = task_nice(p);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity) {
+ rcu_read_unlock();
+ return DOWN_MIGRATION;
+ }
+
+ if (!grp && !task_will_fit(p, cpu)) {
+ rcu_read_unlock();
+ return UP_MIGRATION;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (new_cpu != cpu) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+
+static inline int nr_big_tasks(struct rq *rq)
+{
+ return rq->hmp_stats.nr_big_tasks;
+}
+
+unsigned int cpu_temp(int cpu)
+{
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+
+ if (per_cpu_info)
+ return per_cpu_info[cpu].temp;
+ else
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+struct cpu_select_env;
+struct sched_cluster;
+
+static inline int task_will_fit(struct task_struct *p, int cpu)
+{
+ return 1;
+}
+
+static inline int select_best_cpu(struct task_struct *p, int target,
+ int reason, int sync)
+{
+ return 0;
+}
+
+unsigned int power_cost(int cpu, u64 demand)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline int
+spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+ return 0;
+}
+
+static inline int sched_boost(void)
+{
+ return 0;
+}
+
+static inline int is_big_task(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline int nr_big_tasks(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int is_cpu_throttling_imminent(int cpu)
+{
+ return 0;
+}
+
+static inline int is_task_migration_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+unsigned int cpu_temp(int cpu)
+{
+ return 0;
+}
+
+static inline void
+inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+static inline void
+dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+
+static inline void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ return 1;
+}
+
+static inline struct sched_cluster *rq_cluster(struct rq *rq)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
#error "load tracking assumes 2^10 as unit"
#endif
@@ -2564,6 +4322,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
u32 contrib;
unsigned int delta_w, scaled_delta_w, decayed = 0;
unsigned long scale_freq, scale_cpu;
+ struct sched_entity *se = NULL;
delta = now - sa->last_update_time;
/*
@@ -2584,6 +4343,12 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return 0;
sa->last_update_time = now;
+ if (sched_use_pelt && cfs_rq && weight) {
+ se = container_of(sa, struct sched_entity, avg);
+ if (entity_is_task(se) && se->on_rq)
+ dec_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se));
+ }
+
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
@@ -2604,6 +4369,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
scaled_delta_w = cap_scale(delta_w, scale_freq);
if (weight) {
sa->load_sum += weight * scaled_delta_w;
+ add_to_scaled_stat(cpu, sa, delta_w);
if (cfs_rq) {
cfs_rq->runnable_load_sum +=
weight * scaled_delta_w;
@@ -2630,6 +4396,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
contrib = cap_scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
+ add_to_scaled_stat(cpu, sa, contrib);
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
@@ -2641,9 +4408,14 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
scaled_delta = cap_scale(delta, scale_freq);
if (weight) {
sa->load_sum += weight * scaled_delta;
+ add_to_scaled_stat(cpu, sa, delta);
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * scaled_delta;
}
+
+ if (se && entity_is_task(se) && se->on_rq)
+ inc_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se));
+
if (running)
sa->util_sum += scaled_delta * scale_cpu;
@@ -2884,8 +4656,192 @@ static inline int idle_balance(struct rq *rq)
return 0;
}
+static inline void
+inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+static inline void
+dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+#define clear_ravg_pred_demand(p) ((p)->ravg.pred_demand = 0)
+#else
+#define clear_ravg_pred_demand(p)
+#endif
+
+void init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows = sched_init_task_load_windows;
+ u32 init_load_pelt = sched_init_task_load_pelt;
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ rcu_assign_pointer(p->grp, NULL);
+ INIT_LIST_HEAD(&p->grp_list);
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ p->cpu_cycles = 0;
+
+ if (init_load_pct) {
+ init_load_pelt = div64_u64((u64)init_load_pct *
+ (u64)LOAD_AVG_MAX, 100);
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)sched_ravg_window, 100);
+ }
+
+ p->ravg.demand = init_load_windows;
+ clear_ravg_pred_demand(p);
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+ p->se.avg.runnable_avg_sum_scaled = init_load_pelt;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+void init_new_task_load(struct task_struct *p)
+{
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
+#ifdef CONFIG_SCHED_HMP
+
+/* Return task demand in percentage scale */
+unsigned int pct_task_load(struct task_struct *p)
+{
+ unsigned int load;
+
+ load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
+
+ return load;
+}
+
+/*
+ * Add a scaled version of 'delta' to runnable_avg_sum_scaled;
+ * 'delta' is scaled with reference to the "best" cpu.
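+ *
+ * Example (illustrative numbers): a CPU running at half of
+ * max_possible_freq with a cpu_efficiency() of 512 against a
+ * max_possible_efficiency of 1024 contributes
+ * delta * 1/2 * 512/1024 = delta/4 to runnable_avg_sum_scaled.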
+ */
+static inline void
+add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta)
+{
+ int cur_freq = cpu_cur_freq(cpu);
+ u64 scaled_delta;
+ int sf;
+
+ if (!sched_enable_hmp)
+ return;
+
+ if (unlikely(cur_freq > max_possible_freq))
+ cur_freq = max_possible_freq;
+
+ scaled_delta = div64_u64(delta * cur_freq, max_possible_freq);
+ sf = (cpu_efficiency(cpu) * 1024) / max_possible_efficiency;
+ scaled_delta *= sf;
+ scaled_delta >>= 10;
+ sa->runnable_avg_sum_scaled += scaled_delta;
+}
+
+static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods)
+{
+ if (!sched_enable_hmp)
+ return;
+
+ sa->runnable_avg_sum_scaled =
+ decay_load(sa->runnable_avg_sum_scaled,
+ periods);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->hmp_stats.nr_big_tasks = 0;
+ cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
+ set_pred_demands_sum(&cfs_rq->hmp_stats, 0);
+}
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg +=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ set_pred_demands_sum(stats, stats->pred_demands_sum +
+ cfs_rq->hmp_stats.pred_demands_sum);
+}
+
+static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg -=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ set_pred_demands_sum(stats, stats->pred_demands_sum -
+ cfs_rq->hmp_stats.pred_demands_sum);
+
+ BUG_ON(stats->nr_big_tasks < 0 ||
+ (s64)stats->cumulative_runnable_avg < 0);
+ verify_pred_demands_sum(stats);
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta)
+{
+}
+
+static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods)
+{
+}
+
+static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -2931,6 +4887,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
trace_sched_stat_blocked(tsk, delta);
+ trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
@@ -3498,6 +5455,33 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+/*
+ * Check if the task is part of a hierarchy where some cfs_rq does not
+ * have any runtime left.
+ *
+ * We can't rely on throttled_hierarchy() to do this test, as
+ * cfs_rq->throttle_count will not have been updated yet when this
+ * function is called from scheduler_tick().
+ */
+static int task_will_be_throttled(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return 0;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ if (!cfs_rq->runtime_enabled)
+ continue;
+ if (cfs_rq->runtime_remaining <= 0)
+ return 1;
+ }
+
+ return 0;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
@@ -3577,13 +5561,20 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+#ifdef CONFIG_SCHED_HMP
+ dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
+#endif
if (qcfs_rq->load.weight)
dequeue = 0;
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, task_delta);
+#ifdef CONFIG_SCHED_HMP
+ dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
+#endif
+ }
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -3604,6 +5595,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
+
+ /* Log effect on hmp stats after throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -3613,6 +5610,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
int enqueue = 1;
long task_delta;
+ struct cfs_rq *tcfs_rq = cfs_rq;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -3640,17 +5638,30 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+#ifdef CONFIG_SCHED_HMP
+ inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
+#endif
if (cfs_rq_throttled(cfs_rq))
break;
}
- if (!se)
+ if (!se) {
add_nr_running(rq, task_delta);
+#ifdef CONFIG_SCHED_HMP
+ inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
+#endif
+ }
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
+
+ /* Log effect on hmp stats after un-throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
@@ -3971,6 +5982,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ init_cfs_rq_hmp_stats(cfs_rq);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4086,7 +6098,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -4102,8 +6114,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
/*
* called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * current task is from our class.
*/
static void hrtick_update(struct rq *rq)
{
@@ -4112,8 +6123,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -4152,6 +6162,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
flags = ENQUEUE_WAKEUP;
}
@@ -4159,6 +6170,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4167,9 +6179,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
-
+ inc_rq_hmp_stats(rq, p, 1);
+ }
hrtick_update(rq);
}
@@ -4199,6 +6212,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4219,6 +6233,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4227,9 +6242,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, 1);
-
+ dec_rq_hmp_stats(rq, p, 1);
+ }
hrtick_update(rq);
}
@@ -4849,6 +6865,11 @@ static int select_idle_sibling(struct task_struct *p, int target)
if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
return i;
+ if (!sysctl_sched_wake_to_idle &&
+ !(current->flags & PF_WAKE_UP_IDLE) &&
+ !(p->flags & PF_WAKE_UP_IDLE))
+ return target;
+
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
@@ -4931,6 +6952,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
+ if (sched_enable_hmp)
+ return select_best_cpu(p, prev_cpu, 0, sync);
+
if (sd_flag & SD_BALANCE_WAKE)
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
@@ -5518,6 +7542,13 @@ enum fbq_type { regular, remote, all };
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_SCHED_BOOST_ACTIVE_BALANCE 0x40
+#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
+#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \
+ LBF_BIG_TASK_ACTIVE_BALANCE)
+#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
@@ -5534,6 +7565,8 @@ struct lb_env {
long imbalance;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
+ unsigned int busiest_grp_capacity;
+ unsigned int busiest_nr_running;
unsigned int flags;
@@ -5545,6 +7578,9 @@ struct lb_env {
struct list_head tasks;
};
+static DEFINE_PER_CPU(bool, dbs_boost_needed);
+static DEFINE_PER_CPU(int, dbs_boost_load_moved);
+
/*
* Is this task likely cache-hot:
*/
@@ -5640,6 +7676,7 @@ static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
+ int twf, group_cpus;
lockdep_assert_held(&env->src_rq->lock);
@@ -5686,6 +7723,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
+ if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) &&
+ nr_big_tasks(env->src_rq) && !is_big_task(p))
+ return 0;
+
+ twf = task_will_fit(p, env->dst_cpu);
+
+ /*
+ * Attempt to not pull tasks that don't fit. We may get lucky and find
+ * one that actually fits.
+ */
+ if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
+ return 0;
+
+ if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+ !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
+ return 0;
+
+ /*
+ * Group imbalance can sometimes cause work to be pulled across groups
+ * even though the group could have managed the imbalance on its own.
+ * Prevent inter-cluster migration of tasks that don't fit when the
+ * busiest group runs no more tasks than it has CPUs' worth of
+ * capacity.
+ */
+ group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
+ SCHED_CAPACITY_SCALE);
+ if (!twf && env->busiest_nr_running <= group_cpus)
+ return 0;
+
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
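
A worked example of the guard above, with illustrative numbers: a busiest group of two CPUs of capacity 920 each gives busiest_grp_capacity = 1840, so

    group_cpus = DIV_ROUND_UP(1840, 1024) = 2;

and a task that does not fit (twf == 0) stays put while the busiest group runs at most two tasks, since the group can resolve such an imbalance internally.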
@@ -5693,15 +7758,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) IDLE or NEWLY_IDLE balance.
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
- if (tsk_cache_hot <= 0 ||
+ if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
@@ -5721,9 +7787,13 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (rcu_access_pointer(p->grp))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
@@ -5751,6 +7821,8 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
+ per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
+
return p;
}
return NULL;
@@ -5770,12 +7842,20 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
unsigned long load;
int detached = 0;
+ int orig_loop = env->loop;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
+ if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
+ !sched_boost())
+ env->flags |= LBF_IGNORE_BIG_TASKS;
+ else if (!same_cluster(env->dst_cpu, env->src_cpu))
+ env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
+
+redo:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
@@ -5814,6 +7894,7 @@ static int detach_tasks(struct lb_env *env)
detached++;
env->imbalance -= load;
+ per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
#ifdef CONFIG_PREEMPT
/*
@@ -5837,6 +7918,15 @@ next:
list_move_tail(&p->se.group_node, tasks);
}
+ if (env->flags & (LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
+ tasks = &env->src_rq->cfs_tasks;
+ env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
+ env->loop = orig_loop;
+ goto redo;
+ }
+
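The redo label implements a two-pass scheme. Restructured without the goto, the control flow is equivalent to the sketch below (detach_pass() is a hypothetical single-pass helper, and only one of the two restriction flags is shown; which flag gets set depends on the capacity/cluster checks above):

    static int detach_restricted_then_any(struct lb_env *env)
    {
            int detached;

            env->flags |= LBF_IGNORE_BIG_TASKS;     /* restricted pass */
            detached = detach_pass(env);
            if (!detached) {
                    /* nothing movable under the restriction: retry without it */
                    env->flags &= ~LBF_IGNORE_BIG_TASKS;
                    detached = detach_pass(env);
            }
            return detached;
    }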
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
@@ -5855,9 +7945,11 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
+ if (task_notify_on_migrate(p))
+ per_cpu(dbs_boost_needed, task_cpu(p)) = true;
}
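
Taken together, the reordered detach_task()/attach_task() bracket a migration with TASK_ON_RQ_MIGRATING so that code reached from set_task_cpu() sees the task as neither queued nor running. A condensed sketch of the pairing, omitting the double-locking and the dbs/thread-group bookkeeping added above:

    /* source side: publish MIGRATING before dequeueing */
    p->on_rq = TASK_ON_RQ_MIGRATING;
    deactivate_task(src_rq, p, 0);
    set_task_cpu(p, dst_cpu);

    /* destination side: enqueue first, then publish QUEUED */
    activate_task(dst_rq, p, 0);
    p->on_rq = TASK_ON_RQ_QUEUED;
    check_preempt_curr(dst_rq, p, 0);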
/*
@@ -6001,6 +8093,10 @@ struct sg_lb_stats {
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
+#ifdef CONFIG_SCHED_HMP
+ unsigned long sum_nr_big_tasks;
+ u64 group_cpu_load; /* Scaled load of all CPUs of the group */
+#endif
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
@@ -6047,6 +8143,56 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
+#ifdef CONFIG_SCHED_HMP
+
+static int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ int local_cpu, busiest_cpu;
+ int local_capacity, busiest_capacity;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
+
+ if (!sysctl_sched_restrict_cluster_spill || sched_boost())
+ return 0;
+
+ local_cpu = group_first_cpu(sds->local);
+ busiest_cpu = group_first_cpu(sds->busiest);
+
+ local_capacity = cpu_max_possible_capacity(local_cpu);
+ busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
+
+ if (local_capacity < busiest_capacity ||
+ (local_capacity == busiest_capacity &&
+ local_pwr_cost <= busiest_pwr_cost))
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
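In effect, a lower-capacity (or cheaper equal-capacity) cluster declines to pull from a higher-capacity one unless the latter is genuinely spilling. With illustrative numbers: for a 4-CPU busiest group and sysctl_sched_spill_nr_run = 3, the function returns 1 (bail) while that group holds fewer than 4 * 3 = 12 runnable tasks and its summed load stays under 4 * sched_spill_load.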
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
@@ -6276,7 +8422,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
static inline enum
group_type group_classify(struct sched_group *group,
- struct sg_lb_stats *sgs)
+ struct sg_lb_stats *sgs, struct lb_env *env)
{
if (sgs->group_no_capacity)
return group_overloaded;
@@ -6309,6 +8455,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, 0),
+ cpu_temp(i));
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -6322,6 +8473,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (rq->nr_running > 1)
*overload = true;
+#ifdef CONFIG_SCHED_HMP
+ sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
+ sgs->group_cpu_load += cpu_load(i);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6341,8 +8497,40 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ sgs->group_type = group_classify(group, sgs, env);
+}
+
+#ifdef CONFIG_SCHED_HMP
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ if (env->idle != CPU_NOT_IDLE &&
+ cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
+ if (sched_boost() && !sds->busiest && sgs->sum_nr_running) {
+ env->flags |= LBF_SCHED_BOOST_ACTIVE_BALANCE;
+ return true;
+ }
+
+ if (sgs->sum_nr_big_tasks >
+ sds->busiest_stat.sum_nr_big_tasks) {
+ env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ return false;
}
+#endif
/**
* update_sd_pick_busiest - return 1 on busiest group
@@ -6364,6 +8552,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
+ return true;
+
if (sgs->group_type > busiest->group_type)
return true;
@@ -6475,12 +8666,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
+ sgs->group_type = group_classify(sg, sgs, env);
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
+ env->busiest_nr_running = sgs->sum_nr_running;
+ env->busiest_grp_capacity = sgs->group_capacity;
}
next_group:
@@ -6732,6 +8925,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ if (env->flags & LBF_HMP_ACTIVE_BALANCE)
+ goto force_balance;
+
+ if (bail_inter_cluster_balance(env, &sds))
+ goto out_balanced;
+
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -6793,6 +8992,57 @@ out_balanced:
return NULL;
}
+#ifdef CONFIG_SCHED_HMP
+static struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ struct rq *busiest = NULL, *busiest_big = NULL;
+ u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
+ int max_nr_big = 0, nr_big;
+ bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
+ int i;
+
+ for_each_cpu(i, sched_group_cpus(group)) {
+ struct rq *rq = cpu_rq(i);
+ u64 cumulative_runnable_avg =
+ rq->hmp_stats.cumulative_runnable_avg;
+
+ if (!cpumask_test_cpu(i, env->cpus))
+ continue;
+
+ if (find_big) {
+ nr_big = nr_big_tasks(rq);
+ if (nr_big > max_nr_big ||
+ (nr_big > 0 && nr_big == max_nr_big &&
+ cumulative_runnable_avg > max_runnable_avg_big)) {
+ max_runnable_avg_big = cumulative_runnable_avg;
+ busiest_big = rq;
+ max_nr_big = nr_big;
+ continue;
+ }
+ }
+
+ if (cumulative_runnable_avg > max_runnable_avg) {
+ max_runnable_avg = cumulative_runnable_avg;
+ busiest = rq;
+ }
+ }
+
+ if (busiest_big)
+ return busiest_big;
+
+ env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
+ return busiest;
+}
+#else
+static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ return NULL;
+}
+#endif
+
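The busiest-runqueue choice above reduces to an ordering predicate; a sketch (the helper name is ours):

    /* "a" is busier than "b": more big tasks wins; ties fall to load. */
    static bool busier_hmp(int nr_big_a, u64 cra_a, int nr_big_b, u64 cra_b)
    {
            return nr_big_a > nr_big_b ||
                   (nr_big_a > 0 && nr_big_a == nr_big_b && cra_a > cra_b);
    }

If no runqueue with big tasks turns up, LBF_BIG_TASK_ACTIVE_BALANCE is cleared and the plain max-load runqueue is returned instead.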
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
@@ -6803,6 +9053,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
+ if (sched_enable_hmp)
+ return find_busiest_queue_hmp(env, group);
+
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -6870,15 +9123,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* so long as it is large enough.
*/
-#define MAX_PINNED_INTERVAL 512
+#define MAX_PINNED_INTERVAL 16
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+#define NEED_ACTIVE_BALANCE_THRESHOLD 10
+
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (env->flags & LBF_HMP_ACTIVE_BALANCE)
+ return 1;
+
if (env->idle == CPU_NEWLY_IDLE) {
/*
@@ -6903,11 +9161,10 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+ return unlikely(sd->nr_balance_failed >
+ sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
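
A worked example with cache_nice_tries == 1: active balance previously fired once nr_balance_failed exceeded 3; it now takes more than 11 failed attempts. After a kick the counter is reset to 1 + 10 - 1 = 10 (see the reset further down), so, exactly as before, two further failed periodic balances re-arm active balancing.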
-static int active_load_balance_cpu_stop(void *data);
-
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
@@ -6950,10 +9207,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
- int ld_moved, cur_ld_moved, active_balance = 0;
+ int ld_moved = 0, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = sd->parent;
- struct sched_group *group;
- struct rq *busiest;
+ struct sched_group *group = NULL;
+ struct rq *busiest = NULL;
unsigned long flags;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
@@ -6967,6 +9224,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
+ .imbalance = 0,
+ .flags = 0,
+ .loop = 0,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
};
/*
@@ -6978,6 +9240,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask);
+ per_cpu(dbs_boost_load_moved, this_cpu) = 0;
schedstat_inc(sd, lb_count[idle]);
redo:
@@ -7019,6 +9282,13 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ /* The world might have changed. Validate assumptions */
+ if (busiest->nr_running <= 1) {
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ env.flags &= ~LBF_ALL_PINNED;
+ goto no_move;
+ }
+
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
@@ -7106,15 +9376,19 @@ more_balance:
}
}
+no_move:
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ if (!(env.flags & LBF_HMP_ACTIVE_BALANCE))
+ schedstat_inc(sd, lb_failed[idle]);
+
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ !(env.flags & LBF_HMP_ACTIVE_BALANCE))
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -7148,17 +9422,45 @@ more_balance:
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ *continue_balancing = 0;
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ sd->nr_balance_failed =
+ sd->cache_nice_tries +
+ NEED_ACTIVE_BALANCE_THRESHOLD - 1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ if (per_cpu(dbs_boost_needed, this_cpu)) {
+ struct migration_notify_data mnd;
+
+ mnd.src_cpu = cpu_of(busiest);
+ mnd.dest_cpu = this_cpu;
+ mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
+ if (mnd.load > 100)
+ mnd.load = 100;
+ atomic_notifier_call_chain(&migration_notifier_head,
+ 0, (void *)&mnd);
+ per_cpu(dbs_boost_needed, this_cpu) = false;
+ per_cpu(dbs_boost_load_moved, this_cpu) = 0;
+ }
+
+ /* Assumes one 'busiest' cpu that we pulled tasks from */
+ if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
+ } else {
+ check_for_freq_change(this_rq, true, false);
+ }
+ }
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
@@ -7206,6 +9508,11 @@ out_one_pinned:
ld_moved = 0;
out:
+ trace_sched_load_balance(this_cpu, idle, *continue_balancing,
+ group ? group->cpumask[0] : 0,
+ busiest ? busiest->nr_running : 0,
+ env.imbalance, env.flags, ld_moved,
+ sd->balance_interval);
return ld_moved;
}
@@ -7301,9 +9608,12 @@ static int idle_balance(struct rq *this_rq)
/*
* Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * now runnable tasks on the balance rq or if
+ * continue_balancing has been unset (only possible
+ * due to active migration).
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ !continue_balancing)
break;
}
rcu_read_unlock();
@@ -7350,11 +9660,28 @@ static int active_load_balance_cpu_stop(void *data)
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
- struct sched_domain *sd;
+ struct sched_domain *sd = NULL;
struct task_struct *p = NULL;
+ struct task_struct *push_task;
+ int push_task_detached = 0;
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .flags = 0,
+ .loop = 0,
+ };
+ bool moved = false;
raw_spin_lock_irq(&busiest_rq->lock);
+ per_cpu(dbs_boost_load_moved, target_cpu) = 0;
+
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
@@ -7371,6 +9698,20 @@ static int active_load_balance_cpu_stop(void *data)
*/
BUG_ON(busiest_rq == target_rq);
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
+ if (push_task) {
+ if (task_on_rq_queued(push_task) &&
+ push_task->state == TASK_RUNNING &&
+ task_cpu(push_task) == busiest_cpu &&
+ cpu_online(target_cpu)) {
+ detach_task(push_task, &env);
+ push_task_detached = 1;
+ moved = true;
+ }
+ goto out_unlock;
+ }
+
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -7380,33 +9721,63 @@ static int active_load_balance_cpu_stop(void *data)
}
if (likely(sd)) {
- struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
- };
-
+ env.sd = sd;
schedstat_inc(sd, alb_count);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ moved = true;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
+
+ if (push_task)
+ busiest_rq->push_task = NULL;
+
raw_spin_unlock(&busiest_rq->lock);
+ if (push_task) {
+ if (push_task_detached)
+ attach_one_task(target_rq, push_task);
+ put_task_struct(push_task);
+ clear_reserved(target_cpu);
+ }
+
if (p)
attach_one_task(target_rq, p);
local_irq_enable();
+ if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
+ } else if (moved) {
+ check_for_freq_change(target_rq, true, false);
+ }
+
+ if (per_cpu(dbs_boost_needed, target_cpu)) {
+ struct migration_notify_data mnd;
+
+ mnd.src_cpu = cpu_of(busiest_rq);
+ mnd.dest_cpu = target_cpu;
+ mnd.load = per_cpu(dbs_boost_load_moved, target_cpu);
+ if (mnd.load > 100)
+ mnd.load = 100;
+ atomic_notifier_call_chain(&migration_notifier_head,
+ 0, (void *)&mnd);
+
+ per_cpu(dbs_boost_needed, target_cpu) = false;
+ per_cpu(dbs_boost_load_moved, target_cpu) = 0;
+ }
return 0;
}
@@ -7428,9 +9799,50 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-static inline int find_new_ilb(void)
+#ifdef CONFIG_SCHED_HMP
+static inline int find_new_hmp_ilb(int type)
+{
+ int call_cpu = raw_smp_processor_id();
+ struct sched_domain *sd;
+ int ilb;
+
+ rcu_read_lock();
+
+ /* Pick an idle cpu "closest" to call_cpu */
+ for_each_domain(call_cpu, sd) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ sched_domain_span(sd)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ (hmp_capable() &&
+ cpu_max_possible_capacity(ilb) <=
+ cpu_max_possible_capacity(call_cpu)) ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
+ rcu_read_unlock();
+ reset_balance_interval(ilb);
+ return ilb;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+ return nr_cpu_ids;
+}
+#else /* CONFIG_SCHED_HMP */
+static inline int find_new_hmp_ilb(int type)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
+static inline int find_new_ilb(int type)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int ilb;
+
+ if (sched_enable_hmp)
+ return find_new_hmp_ilb(type);
+
+ ilb = cpumask_first(nohz.idle_cpus_mask);
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -7443,13 +9855,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -7734,6 +10146,70 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill || sched_boost())
+ return 1;
+
+ if (hmp_capable() && cpu_max_possible_capacity(cpu) ==
+ max_possible_capacity)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+ /*
+ * Change the kick type to limit to CPUs that
+ * are of equal or lower capacity.
+ */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ unsigned long now = jiffies;
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
+
+ if (sched_enable_hmp)
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ return (rq->nr_running >= 2);
+}
+
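The kick type computed here feeds the ILB choice in find_new_hmp_ilb() earlier in this patch; its eligibility test reduces to the following sketch (the helper name is ours):

    static bool ilb_allowed(int type, int ilb, int call_cpu)
    {
            if (type != NOHZ_KICK_RESTRICT)
                    return true;
            /* restricted kick: no bigger and no costlier than the caller */
            return (hmp_capable() &&
                    cpu_max_possible_capacity(ilb) <=
                    cpu_max_possible_capacity(call_cpu)) ||
                   cpu_max_power_cost(ilb) <= cpu_max_power_cost(call_cpu);
    }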
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
@@ -7745,12 +10221,14 @@ end:
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
- unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy;
+#endif
+ int cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
@@ -7763,19 +10241,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
-
- if (time_before(now, nohz.next_balance))
- return false;
-
- if (rq->nr_running >= 2)
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
+#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
@@ -7807,6 +10276,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
unlock:
rcu_read_unlock();
+#endif
return kick;
}
#else
@@ -7840,6 +10310,8 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
+ int type = NOHZ_KICK_ANY;
+
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
@@ -7847,8 +10319,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
@@ -8308,6 +10780,11 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..acee1854c3d0 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
-SCHED_FEAT(TTWU_QUEUE, true)
+SCHED_FEAT(TTWU_QUEUE, false)
#ifdef HAVE_RT_PUSH_IPI
/*
@@ -68,4 +68,3 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
-
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02fd3..2489140a7c51 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fdf9b..36c6634236fb 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -79,6 +79,26 @@ static void update_curr_idle(struct rq *rq)
{
}
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void
+dec_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void
+fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+}
+
+#endif
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -107,4 +127,9 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_idle,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_idle,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle,
+#endif
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..cfec881491ef 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <trace/events/sched.h>
int sched_rr_timeslice = RR_TIMESLICE;
@@ -889,6 +890,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
+static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *rt_se;
+ char buf[500];
+ char *pos = buf;
+ char *end = buf + sizeof(buf);
+ int idx;
+
+ pos += snprintf(pos, sizeof(buf),
+ "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
+ rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
+ goto out;
+
+ pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
+ idx = sched_find_first_bit(array->bitmap);
+ while (idx < MAX_RT_PRIO) {
+ list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
+ if (pos < end)
+ pos += snprintf(pos, end - pos, "\t%s (%d)\n",
+ p->comm, p->pid);
+ }
+ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+ }
+out:
+#ifdef CONFIG_PANIC_ON_RT_THROTTLING
+ /*
+ * Use pr_err() in the BUG() case since printk_sched() will
+ * not get flushed and deadlock is not a concern.
+ */
+ pr_err("%s", buf);
+ BUG();
+#else
+ printk_deferred("%s", buf);
+#endif
+}
+
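A note on the buffer handling above: pos can run past end once snprintf() truncates, which is why every append inside the loop is both guarded and budgeted:

    if (pos < end)
            pos += snprintf(pos, end - pos, "\t%s (%d)\n",
                            p->comm, p->pid);

The first write may use sizeof(buf) safely because pos still points at the start of the buffer there.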
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -912,8 +958,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
+ static bool once = false;
+
rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
+
+ if (!once) {
+ once = true;
+ dump_throttled_rt_tasks(rt_rq);
+ }
} else {
/*
* In case we did anyway, make it go away,
@@ -1130,6 +1182,41 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
@@ -1261,6 +1348,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ inc_hmp_sched_stats_rt(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1272,6 +1360,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ dec_hmp_sched_stats_rt(rq, p);
dequeue_pushable_task(rq, p);
}
@@ -1314,11 +1403,28 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
+select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ int target;
+
+ rcu_read_lock();
+ target = find_lowest_rq(p);
+ if (target != -1)
+ cpu = target;
+ rcu_read_unlock();
+
+ return cpu;
+}
+
+static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ if (sched_enable_hmp)
+ return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
+
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1556,6 +1662,74 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+#ifdef CONFIG_SCHED_HMP
+
+static int find_lowest_rq_hmp(struct task_struct *task)
+{
+ struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask);
+ struct cpumask candidate_mask = CPU_MASK_NONE;
+ struct sched_cluster *cluster;
+ int best_cpu = -1;
+ int prev_cpu = task_cpu(task);
+ u64 cpu_load, min_load = ULLONG_MAX;
+ int i;
+ int restrict_cluster = sched_boost() ? 0 :
+ sysctl_sched_restrict_cluster_spill;
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return best_cpu;
+
+ if (task->nr_cpus_allowed == 1)
+ return best_cpu; /* No other targets possible */
+
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return best_cpu; /* No targets found */
+
+ /*
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ */
+
+ for_each_sched_cluster(cluster) {
+ cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
+
+ if (cpumask_empty(&candidate_mask))
+ continue;
+
+ for_each_cpu(i, &candidate_mask) {
+ if (sched_cpu_high_irqload(i))
+ continue;
+
+ cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg;
+ if (!restrict_cluster)
+ cpu_load = scale_load_to_cpu(cpu_load, i);
+
+ if (cpu_load < min_load ||
+ (cpu_load == min_load &&
+ (i == prev_cpu || (best_cpu != prev_cpu &&
+ cpus_share_cache(prev_cpu, i))))) {
+ min_load = cpu_load;
+ best_cpu = i;
+ }
+ }
+ if (restrict_cluster && best_cpu != -1)
+ break;
+ }
+
+ return best_cpu;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static int find_lowest_rq_hmp(struct task_struct *task)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
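For candidates with equal cumulative load, the comparison above applies a fixed preference order; a sketch of the tie-break (the helper name is ours):

    /* Prefer candidate i over the current best when loads tie. */
    static bool prefer_on_tie(int i, int best_cpu, int prev_cpu)
    {
            return i == prev_cpu ||
                   (best_cpu != prev_cpu && cpus_share_cache(prev_cpu, i));
    }

Clusters are visited in order, and with cluster spill restricted the walk stops at the first cluster that yields a candidate.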
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
@@ -1563,6 +1737,9 @@ static int find_lowest_rq(struct task_struct *task)
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ if (sched_enable_hmp)
+ return find_lowest_rq_hmp(task);
+
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
return -1;
@@ -1780,7 +1957,9 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, lowest_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(lowest_rq, next_task, 0);
ret = 1;
@@ -2034,7 +2213,9 @@ static void pull_rt_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
/*
* We continue with the search, just in
@@ -2116,6 +2297,7 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
+
#endif /* CONFIG_SMP */
/*
@@ -2290,6 +2472,11 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_rt,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_rt,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0517abd7dd73..b9566cf3ad37 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -27,6 +27,20 @@ extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
+
+struct freq_max_load_entry {
+ /* The maximum load which has accounted governor's headroom. */
+ u64 hdemand;
+};
+
+struct freq_max_load {
+ struct rcu_head rcu;
+ int length;
+ struct freq_max_load_entry freqs[0];
+};
+
+extern DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
+
extern long calc_load_fold_active(struct rq *this_rq);
#ifdef CONFIG_SMP
@@ -240,6 +254,11 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;
+ bool notify_on_migrate;
+#ifdef CONFIG_SCHED_HMP
+ bool upmigrate_discouraged;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
@@ -343,6 +362,82 @@ struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+#ifdef CONFIG_SCHED_HMP
+
+struct hmp_sched_stats {
+ int nr_big_tasks;
+ u64 cumulative_runnable_avg;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ u64 pred_demands_sum;
+#endif
+};
+
+struct sched_cluster {
+ struct list_head list;
+ struct cpumask cpus;
+ int id;
+ int max_power_cost;
+ int min_power_cost;
+ int max_possible_capacity;
+ int capacity;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ unsigned int exec_scale_factor;
+ /*
+ * max_freq = user maximum
+ * max_mitigated_freq = thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq;
+ unsigned int max_possible_freq;
+ bool freq_init_done;
+ int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
+ unsigned int static_cluster_pwr_cost;
+};
+
+extern unsigned long all_cluster_ids[];
+
+static inline int cluster_first_cpu(struct sched_cluster *cluster)
+{
+ return cpumask_first(&cluster->cpus);
+}
+
+struct related_thread_group {
+ int id;
+ raw_spinlock_t lock;
+ struct list_head tasks;
+ struct list_head list;
+ struct sched_cluster *preferred_cluster;
+ struct rcu_head rcu;
+ u64 last_update;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time __percpu *cpu_time; /* one per cluster */
+#endif
+};
+
+struct migration_sum_data {
+ struct rq *src_rq, *dst_rq;
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ struct group_cpu_time *src_cpu_time, *dst_cpu_time;
+#endif
+};
+
+extern struct list_head cluster_head;
+extern int num_clusters;
+extern struct sched_cluster *sched_cluster[NR_CPUS];
+extern int group_will_fit(struct sched_cluster *cluster,
+ struct related_thread_group *grp, u64 demand);
+
+struct cpu_cycle {
+ u64 cycles;
+ u64 time;
+};
+
+#define for_each_sched_cluster(cluster) \
+ list_for_each_entry_rcu(cluster, &cluster_head, list)
+
+#endif
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -411,6 +506,11 @@ struct cfs_rq {
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef CONFIG_SCHED_HMP
+ struct hmp_sched_stats hmp_stats;
+#endif
+
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
@@ -624,6 +724,7 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
+ struct task_struct *push_task;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
@@ -635,11 +736,41 @@ struct rq {
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
+ int cstate, wakeup_latency, wakeup_energy;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_HMP
+ struct sched_cluster *cluster;
+ struct cpumask freq_domain_cpumask;
+ struct hmp_sched_stats hmp_stats;
+
+ u64 window_start;
+ unsigned long hmp_flags;
+
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+ unsigned int static_cpu_pwr_cost;
+ struct task_struct *ed_task;
+ struct cpu_cycle cc;
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ u64 old_busy_time, old_busy_time_group;
+ int notifier_sent;
+ u64 old_estimated_time;
+#endif
+#endif
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+#endif
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -905,6 +1036,446 @@ static inline void sched_ttwu_pending(void) { }
#include "stats.h"
#include "auto_group.h"
+extern void init_new_task_load(struct task_struct *p);
+
+#ifdef CONFIG_SCHED_HMP
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+extern struct mutex policy_mutex;
+extern unsigned int sched_ravg_window;
+extern unsigned int sched_use_pelt;
+extern unsigned int sched_disable_window_stats;
+extern unsigned int sched_enable_hmp;
+extern unsigned int max_possible_freq;
+extern unsigned int min_max_freq;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern unsigned int max_possible_efficiency;
+extern unsigned int min_possible_efficiency;
+extern unsigned int max_capacity;
+extern unsigned int min_capacity;
+extern unsigned int max_load_scale_factor;
+extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
+extern unsigned int sched_upmigrate;
+extern unsigned int sched_downmigrate;
+extern unsigned int sched_init_task_load_pelt;
+extern unsigned int sched_init_task_load_windows;
+extern unsigned int up_down_migrate_scale_factor;
+extern unsigned int sysctl_sched_restrict_cluster_spill;
+extern unsigned int sched_pred_alert_load;
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+#define MAJOR_TASK_PCT 85
+extern unsigned int sched_major_task_runtime;
+#endif
+
+extern void reset_cpu_hmp_stats(int cpu, int reset_cra);
+extern unsigned int max_task_load(void);
+extern void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock);
+extern void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock);
+
+unsigned int cpu_temp(int cpu);
+int sched_set_group_id(struct task_struct *p, unsigned int group_id);
+extern unsigned int nr_eligible_big_tasks(int cpu);
+extern void update_up_down_migrate(void);
+
+static inline struct sched_cluster *cpu_cluster(int cpu)
+{
+ return cpu_rq(cpu)->cluster;
+}
+
+static inline int cpu_capacity(int cpu)
+{
+ return cpu_rq(cpu)->cluster->capacity;
+}
+
+static inline int cpu_max_possible_capacity(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_possible_capacity;
+}
+
+static inline int cpu_load_scale_factor(int cpu)
+{
+ return cpu_rq(cpu)->cluster->load_scale_factor;
+}
+
+static inline int cpu_efficiency(int cpu)
+{
+ return cpu_rq(cpu)->cluster->efficiency;
+}
+
+static inline unsigned int cpu_cur_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->cur_freq;
+}
+
+static inline unsigned int cpu_min_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->min_freq;
+}
+
+static inline unsigned int cluster_max_freq(struct sched_cluster *cluster)
+{
+ /*
+ * The governor and the thermal driver don't see each other's
+ * mitigation votes, so the cluster stores both and the current
+ * cluster fmax is the min() of the two.
+ */
+ return min(cluster->max_mitigated_freq, cluster->max_freq);
+}
+
+static inline unsigned int cpu_max_freq(int cpu)
+{
+ return cluster_max_freq(cpu_rq(cpu)->cluster);
+}
+
+static inline unsigned int cpu_max_possible_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_possible_freq;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu)
+{
+ return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
+}
+
+static inline int cpu_max_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_power_cost;
+}
+
+static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
+{
+ return div64_u64(cycles, period);
+}
+
+static inline bool hmp_capable(void)
+{
+ return max_possible_capacity != min_max_possible_capacity;
+}
+
+/*
+ * 'load' is measured against the "best" cpu at its best frequency.
+ * Scale it for a given cpu, accounting for how much worse that cpu
+ * is than the best one.
+ */
+static inline u64 scale_load_to_cpu(u64 task_load, int cpu)
+{
+ u64 lsf = cpu_load_scale_factor(cpu);
+
+ if (lsf != 1024) {
+ task_load *= lsf;
+ task_load /= 1024;
+ }
+
+ return task_load;
+}
+
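A worked example with an illustrative load_scale_factor of 1331: a task whose demand is 100 on the best CPU scales to 100 * 1331 / 1024 = 129 on the slower one, i.e. the same work is expected to occupy roughly 30% more of that CPU.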
+static inline unsigned int task_load(struct task_struct *p)
+{
+ if (sched_use_pelt)
+ return p->se.avg.runnable_avg_sum_scaled;
+
+ return p->ravg.demand;
+}
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+#define set_pred_demands_sum(stats, x) ((stats)->pred_demands_sum = (x))
+#define verify_pred_demands_sum(stat) BUG_ON((s64)(stat)->pred_demands_sum < 0)
+#else
+#define set_pred_demands_sum(stats, x)
+#define verify_pred_demands_sum(stat)
+#endif
+
+static inline void
+inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+ u32 task_load;
+
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled :
+ (sched_disable_window_stats ? 0 : p->ravg.demand);
+
+ stats->cumulative_runnable_avg += task_load;
+ set_pred_demands_sum(stats, stats->pred_demands_sum +
+ p->ravg.pred_demand);
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+ u32 task_load;
+
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled :
+ (sched_disable_window_stats ? 0 : p->ravg.demand);
+
+ stats->cumulative_runnable_avg -= task_load;
+
+ BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+ set_pred_demands_sum(stats, stats->pred_demands_sum -
+ p->ravg.pred_demand);
+ verify_pred_demands_sum(stats);
+}
+
+static inline void
+fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 task_load_delta,
+ s64 pred_demand_delta)
+{
+ if (!sched_enable_hmp || sched_disable_window_stats)
+ return;
+
+ stats->cumulative_runnable_avg += task_load_delta;
+ BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+ set_pred_demands_sum(stats, stats->pred_demands_sum +
+ pred_demand_delta);
+ verify_pred_demands_sum(stats);
+}
+
+
+#define pct_to_real(tunable) \
+ (div64_u64((u64)tunable * (u64)max_task_load(), 100))
+
+#define real_to_pct(tunable) \
+ (div64_u64((u64)tunable * (u64)100, (u64)max_task_load()))
+
+#define SCHED_HIGH_IRQ_TIMEOUT 3
+static inline u64 sched_irqload(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+
+ delta = get_jiffies_64() - rq->irqload_ts;
+ /*
+ * This context can be preempted by an irq, and the irq context can
+ * update rq->irqload_ts, so delta may come out negative. That is
+ * fine: a negative delta just means an irq occurred very recently,
+ * so report the average irqload.
+ */
+
+ if (delta < SCHED_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+static inline int sched_cpu_high_irqload(int cpu)
+{
+ return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
+}
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return rcu_dereference(p->grp);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+#define sched_use_pelt 0
+
+struct hmp_sched_stats;
+struct related_thread_group;
+
+static inline u64 scale_load_to_cpu(u64 load, int cpu)
+{
+ return load;
+}
+
+static inline unsigned int nr_eligible_big_tasks(int cpu)
+{
+ return 0;
+}
+
+static inline int pct_task_load(struct task_struct *p) { return 0; }
+
+static inline int cpu_capacity(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
+
+static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+}
+
+static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock)
+{
+}
+
+static inline int sched_cpu_high_irqload(int cpu) { return 0; }
+
+static inline void set_preferred_cluster(struct related_thread_group *grp) { }
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+static inline u32 task_load(struct task_struct *p) { return 0; }
+
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
+/*
+ * Returns the rq capacity of any rq in a group. This does not play
+ * well with groups where rq capacity can change independently.
+ */
+#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
+
+#ifdef CONFIG_SCHED_FREQ_INPUT
+#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
+
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+
+struct group_cpu_time {
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 window_start;
+};
+
+/* Is frequency of two cpus synchronized with each other? */
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ struct rq *rq = cpu_rq(src_cpu);
+
+ if (src_cpu == dst_cpu)
+ return 1;
+
+ return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
+}
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+#define sched_migration_fixup 0
+#define PRED_DEMAND_DELTA (0)
+
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
+
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ return 1;
+}
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
+#ifdef CONFIG_SCHED_HMP
+
+#define BOOST_KICK 0
+#define CPU_RESERVED 1
+
+static inline int is_reserved(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return test_bit(CPU_RESERVED, &rq->hmp_flags);
+}
+
+static inline int mark_reserved(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ /* TODO: rename hmp_flags to boost_flags? */
+ return test_and_set_bit(CPU_RESERVED, &rq->hmp_flags);
+}
+
+static inline void clear_reserved(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ clear_bit(CPU_RESERVED, &rq->hmp_flags);
+}
+
+static inline u64 cpu_cravg_sync(int cpu, int sync)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 load;
+
+ load = rq->hmp_stats.cumulative_runnable_avg;
+
+ /*
+ * If load is being checked in a sync wakeup environment,
+ * we may want to discount the load of the currently running
+ * task.
+ */
+ if (sync && cpu == smp_processor_id()) {
+ if (load > rq->curr->ravg.demand)
+ load -= rq->curr->ravg.demand;
+ else
+ load = 0;
+ }
+
+ return load;
+}
+
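For example (illustrative numbers): on a sync wakeup evaluated on CPU 2 with cumulative_runnable_avg = 700 and rq->curr->ravg.demand = 250, the load reported for placement is 700 - 250 = 450, since the waker is expected to go to sleep immediately.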
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
+extern void pre_big_task_count_change(const struct cpumask *cpus);
+extern void post_big_task_count_change(const struct cpumask *cpus);
+extern void set_hmp_defaults(void);
+extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost);
+extern unsigned int power_cost(int cpu, u64 demand);
+extern void reset_all_window_stats(u64 window_start, unsigned int window_size);
+extern void boost_kick(int cpu);
+extern int sched_boost(void);
+
+#else /* CONFIG_SCHED_HMP */
+
+#define sched_enable_hmp 0
+#define sched_freq_legacy_mode 1
+
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
+static inline void pre_big_task_count_change(void) { }
+static inline void post_big_task_count_change(void) { }
+static inline void set_hmp_defaults(void) { }
+
+static inline void clear_reserved(int cpu) { }
+
+#define trace_sched_cpu_load(...)
+#define trace_sched_cpu_load_lb(...)
+#define trace_sched_cpu_load_cgroup(...)
+#define trace_sched_cpu_load_wakeup(...)
+
+#endif /* CONFIG_SCHED_HMP */
+
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -925,6 +1496,11 @@ static inline struct task_group *task_group(struct task_struct *p)
return p->sched_task_group;
}
+static inline bool task_notify_on_migrate(struct task_struct *p)
+{
+ return task_group(p)->notify_on_migrate;
+}
+
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
@@ -950,7 +1526,10 @@ static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-
+static inline bool task_notify_on_migrate(struct task_struct *p)
+{
+ return false;
+}
#endif /* CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
@@ -1100,6 +1679,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+#define WF_NO_NOTIFIER 0x08 /* do not notify governor */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1228,6 +1808,12 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p);
#endif
+#ifdef CONFIG_SCHED_HMP
+ void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
+ void (*dec_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
+ void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand);
+#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
@@ -1288,7 +1874,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
}
#endif
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
extern void sysrq_sched_debug_show(void);
+#endif
extern void sched_init_granularity(void);
extern void update_max_interval(void);
@@ -1314,6 +1902,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
+ sched_update_nr_prod(cpu_of(rq), count, true);
rq->nr_running = prev_nr + count;
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1340,6 +1929,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
+ sched_update_nr_prod(cpu_of(rq), count, false);
rq->nr_running -= count;
}
@@ -1719,6 +2309,9 @@ enum rq_nohz_flag_bits {
NOHZ_BALANCE_KICK,
};
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
new file mode 100644
index 000000000000..c70e0466c36c
--- /dev/null
+++ b/kernel/sched/sched_avg.c
@@ -0,0 +1,128 @@
+/* Copyright (c) 2012, 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * Scheduler hook for average runqueue determination
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+
+#include "sched.h"
+#include <trace/events/sched.h>
+
+static DEFINE_PER_CPU(u64, nr_prod_sum);
+static DEFINE_PER_CPU(u64, last_time);
+static DEFINE_PER_CPU(u64, nr_big_prod_sum);
+static DEFINE_PER_CPU(u64, nr);
+
+static DEFINE_PER_CPU(unsigned long, iowait_prod_sum);
+static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
+static s64 last_get_time;
+
+/**
+ * sched_get_nr_running_avg
+ * @return: Average nr_running, iowait and nr_big_tasks value since last poll.
+ * The values are scaled by 100 to carry two decimal places
+ * of precision.
+ *
+ * Obtains the average nr_running value since the last poll.
+ * This function must not be called concurrently with itself.
+ */
+void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg)
+{
+ int cpu;
+ u64 curr_time = sched_clock();
+ u64 diff = curr_time - last_get_time;
+ u64 tmp_avg = 0, tmp_iowait = 0, tmp_big_avg = 0;
+
+ *avg = 0;
+ *iowait_avg = 0;
+ *big_avg = 0;
+
+ if (!diff)
+ return;
+
+ /* read and reset nr_running counts */
+ for_each_possible_cpu(cpu) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+ curr_time = sched_clock();
+ tmp_avg += per_cpu(nr_prod_sum, cpu);
+ tmp_avg += per_cpu(nr, cpu) *
+ (curr_time - per_cpu(last_time, cpu));
+
+ tmp_big_avg += per_cpu(nr_big_prod_sum, cpu);
+ tmp_big_avg += nr_eligible_big_tasks(cpu) *
+ (curr_time - per_cpu(last_time, cpu));
+
+ tmp_iowait += per_cpu(iowait_prod_sum, cpu);
+ tmp_iowait += nr_iowait_cpu(cpu) *
+ (curr_time - per_cpu(last_time, cpu));
+
+ per_cpu(last_time, cpu) = curr_time;
+
+ per_cpu(nr_prod_sum, cpu) = 0;
+ per_cpu(nr_big_prod_sum, cpu) = 0;
+ per_cpu(iowait_prod_sum, cpu) = 0;
+
+ spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+ }
+
+ diff = curr_time - last_get_time;
+ last_get_time = curr_time;
+
+ *avg = (int)div64_u64(tmp_avg * 100, diff);
+ *big_avg = (int)div64_u64(tmp_big_avg * 100, diff);
+ *iowait_avg = (int)div64_u64(tmp_iowait * 100, diff);
+
+ trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg);
+
+ BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0);
+ pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n",
+ __func__, *avg, *big_avg, *iowait_avg);
+}
+EXPORT_SYMBOL(sched_get_nr_running_avg);
+
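A hypothetical caller, to make the scaling concrete (the pr_info() formatting is ours): because the results are averages multiplied by 100, avg == 250 means 2.5 tasks were runnable on average since the previous poll.

    int avg, iowait_avg, big_avg;

    sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
    pr_info("nr_running %d.%02d big %d.%02d iowait %d.%02d\n",
            avg / 100, avg % 100, big_avg / 100, big_avg % 100,
            iowait_avg / 100, iowait_avg % 100);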
+/**
+ * sched_update_nr_prod
+ * @cpu: The core id of the nr running driver.
+ * @delta: Adjust nr by 'delta' amount
+ * @inc: Whether we are increasing or decreasing the count
+ * @return: N/A
+ *
+ * Update average with latest nr_running value for CPU
+ */
+void sched_update_nr_prod(int cpu, long delta, bool inc)
+{
+ int diff;
+ s64 curr_time;
+ unsigned long flags, nr_running;
+
+ spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+ nr_running = per_cpu(nr, cpu);
+ curr_time = sched_clock();
+ diff = curr_time - per_cpu(last_time, cpu);
+ per_cpu(last_time, cpu) = curr_time;
+ per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta);
+
+ BUG_ON((s64)per_cpu(nr, cpu) < 0);
+
+ per_cpu(nr_prod_sum, cpu) += nr_running * diff;
+ per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff;
+ per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
+ spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+}
+EXPORT_SYMBOL(sched_update_nr_prod);
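For context, the intended usage of this pair of exports: sched_update_nr_prod() is called from the scheduler on every enqueue/dequeue, and a governor-style client periodically drains the accumulated products through sched_get_nr_running_avg(). A minimal consumer sketch follows; the work item, period and names are illustrative, not part of this patch, and it assumes INIT_DELAYED_WORK(&sample_work, sample_fn) has run at init:

	/* Hypothetical consumer -- illustrative only. */
	static struct delayed_work sample_work;

	static void sample_fn(struct work_struct *work)
	{
		int avg, iowait_avg, big_avg;

		/* Values are scaled by 100: avg == 250 means 2.50 runnable tasks. */
		sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
		pr_info("runnable avg: %d.%02d\n", avg / 100, avg % 100);

		schedule_delayed_work(&sample_work, msecs_to_jiffies(20));
	}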
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index cbc67da10954..134da1cc8fce 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -17,6 +17,41 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -42,12 +77,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ inc_hmp_sched_stats_stop(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ dec_hmp_sched_stats_stop(rq, p);
}
static void yield_task_stop(struct rq *rq)
@@ -134,4 +171,9 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_stop,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_stop,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop,
+#endif
};
diff --git a/kernel/smp.c b/kernel/smp.c
index d903c02223af..abdc48cd79a3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -32,6 +32,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
+/* CPU mask indicating which CPUs to bring online during smp_init() */
+static bool have_boot_cpu_mask;
+static cpumask_var_t boot_cpu_mask;
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -548,6 +551,19 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
+static int __init boot_cpus(char *str)
+{
+ alloc_bootmem_cpumask_var(&boot_cpu_mask);
+ if (cpulist_parse(str, boot_cpu_mask) < 0) {
+ pr_warn("SMP: Incorrect boot_cpus cpumask\n");
+ return -EINVAL;
+ }
+ have_boot_cpu_mask = true;
+ return 0;
+}
+
+early_param("boot_cpus", boot_cpus);
+
/* Setup number of possible processor ids */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
@@ -563,6 +579,21 @@ void __weak smp_announce(void)
printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
}
+/* Should the given CPU be booted during smp_init() ? */
+static inline bool boot_cpu(int cpu)
+{
+ if (!have_boot_cpu_mask)
+ return true;
+
+ return cpumask_test_cpu(cpu, boot_cpu_mask);
+}
+
+static inline void free_boot_cpu_mask(void)
+{
+ if (have_boot_cpu_mask) /* Allocated from boot_cpus() */
+ free_bootmem_cpumask_var(boot_cpu_mask);
+}
+
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
@@ -574,10 +605,12 @@ void __init smp_init(void)
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
- if (!cpu_online(cpu))
+ if (!cpu_online(cpu) && boot_cpu(cpu))
cpu_up(cpu);
}
+ free_boot_cpu_mask();
+
/* Any cleanup work */
smp_announce();
smp_cpus_done(setup_max_cpus);
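Usage note: the new early parameter is consumed from the kernel command line, e.g. boot_cpus=0,2-3 brings up only CPUs 0, 2 and 3 during smp_init(); the remaining present CPUs stay offline and can be onlined later through the usual sysfs hotplug path (echo 1 > /sys/devices/system/cpu/cpuN/online).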
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d264f59bff56..6949476a118f 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -13,6 +13,7 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/smpboot.h>
+#include <linux/kmemleak.h>
#include "smpboot.h"
@@ -177,6 +178,8 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
if (!td)
return -ENOMEM;
+
+ kmemleak_not_leak(td);
td->cpu = cpu;
td->ht = ht;
diff --git a/kernel/sys.c b/kernel/sys.c
index 78947de6f969..b5a8e844a968 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -41,6 +41,8 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2072,10 +2074,158 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ const char __user *name_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int error = 0;
+ pgoff_t pgoff;
+
+ if (name_addr == vma_get_anon_name(vma)) {
+ *prev = vma;
+ goto out;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, name_addr);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto out;
+ }
+
+success:
+ if (!vma->vm_file)
+ vma->anon_name = name_addr;
+
+out:
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+ unsigned long arg)
+{
+ unsigned long tmp;
+ struct vm_area_struct *vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - this matches the handling in madvise.
+ */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ return error;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ return error;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+ (const char __user *)arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ return error;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(current->mm, start);
+ }
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ int error;
+ unsigned long len;
+ unsigned long end;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ error = prctl_set_vma_anon_name(start, end, arg);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ up_write(&mm->mmap_sem);
+
+ return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif
+
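From userspace the new option is reached through the prctl() syscall. A hedged sketch, assuming PR_SET_VMA and PR_SET_VMA_ANON_NAME land in the uapi headers as part of this series (the fallback values below mirror the Android definitions and are otherwise an assumption):

	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_VMA
	#define PR_SET_VMA		0x53564d41
	#define PR_SET_VMA_ANON_NAME	0
	#endif

	int main(void)
	{
		size_t len = 4 * 4096;
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		/*
		 * Name the anonymous region; the kernel stores the user
		 * pointer itself, so the string must stay mapped for the
		 * lifetime of the VMA.
		 */
		if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
			  (unsigned long)p, len, (unsigned long)"my-heap"))
			perror("prctl");
		return 0;
	}

The named region then shows up in /proc/<pid>/maps, which is the point of the feature: attributing otherwise anonymous memory.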
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
+ struct task_struct *tsk;
unsigned char comm[sizeof(me->comm)];
long error;
@@ -2218,6 +2368,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_TID_ADDRESS:
error = prctl_get_tid_address(me, (int __user **)arg2);
break;
+ case PR_SET_TIMERSLACK_PID:
+ if (task_pid_vnr(current) != (pid_t)arg3 &&
+ !capable(CAP_SYS_NICE))
+ return -EPERM;
+ rcu_read_lock();
+ tsk = find_task_by_vpid((pid_t)arg3);
+ if (tsk == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (arg2 <= 0)
+ tsk->timer_slack_ns =
+ tsk->default_timer_slack_ns;
+ else
+ tsk->timer_slack_ns = arg2;
+ put_task_struct(tsk);
+ error = 0;
+ break;
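Note the argument order in this case: arg2 carries the new slack in nanoseconds and arg3 the target pid. A one-line caller sketch (the numeric value of PR_SET_TIMERSLACK_PID comes from the uapi side of this series and is assumed here):

	/* Give pid 1234 a 50us timer slack; 0 restores the default. */
	prctl(PR_SET_TIMERSLACK_PID, 50000UL, 1234, 0, 0);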
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
break;
@@ -2266,6 +2436,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d6639e..81fbed978da3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -104,6 +104,7 @@ extern char core_pattern[];
extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
+extern int extra_free_kbytes;
extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
extern int compat_log;
@@ -284,6 +285,167 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "sched_wake_to_idle",
+ .data = &sysctl_sched_wake_to_idle,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_wakeup_load_threshold",
+ .data = &sysctl_sched_wakeup_load_threshold,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ {
+ .procname = "sched_freq_inc_notify",
+ .data = &sysctl_sched_freq_inc_notify,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_freq_dec_notify",
+ .data = &sysctl_sched_freq_dec_notify,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+#endif
+#ifdef CONFIG_SCHED_HMP
+ {
+ .procname = "sched_cpu_high_irqload",
+ .data = &sysctl_sched_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_ravg_hist_size",
+ .data = &sysctl_sched_ravg_hist_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_window_stats_policy",
+ .data = &sysctl_sched_window_stats_policy,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_spill_load",
+ .data = &sysctl_sched_spill_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_spill_nr_run",
+ .data = &sysctl_sched_spill_nr_run,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_upmigrate",
+ .data = &sysctl_sched_upmigrate_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_downmigrate",
+ .data = &sysctl_sched_downmigrate_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_init_task_load",
+ .data = &sysctl_sched_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_select_prev_cpu_us",
+ .data = &sysctl_sched_select_prev_cpu_us,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_enable_colocation",
+ .data = &sysctl_sched_enable_colocation,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "sched_restrict_cluster_spill",
+ .data = &sysctl_sched_restrict_cluster_spill,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "sched_small_wakee_task_load",
+ .data = &sysctl_sched_small_wakee_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+ {
+ .procname = "sched_big_waker_task_load",
+ .data = &sysctl_sched_big_waker_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ },
+#ifdef CONFIG_SCHED_FREQ_INPUT
+ {
+ .procname = "sched_new_task_windows",
+ .data = &sysctl_sched_new_task_windows,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_pred_alert_freq",
+ .data = &sysctl_sched_pred_alert_freq,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_freq_aggregate",
+ .data = &sysctl_sched_freq_aggregate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+#endif
+ {
+ .procname = "sched_boost",
+ .data = &sysctl_sched_boost,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_boost_handler,
+ },
+#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_min_granularity_ns",
@@ -1172,6 +1334,27 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+ {
+ .procname = "boot_reason",
+ .data = &boot_reason,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+
+ {
+ .procname = "cold_boot",
+ .data = &cold_boot,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
{ }
};
@@ -1393,6 +1576,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "extra_free_kbytes",
+ .data = &extra_free_kbytes,
+ .maxlen = sizeof(extra_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
@@ -1568,6 +1759,44 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
+#ifdef CONFIG_SWAP
+ {
+ .procname = "swap_ratio",
+ .data = &sysctl_swap_ratio,
+ .maxlen = sizeof(sysctl_swap_ratio),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ {
+ .procname = "swap_ratio_enable",
+ .data = &sysctl_swap_ratio_enable,
+ .maxlen = sizeof(sysctl_swap_ratio_enable),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ },
+#endif
{ }
};
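The new vm entries behave like their neighbours: plain integers under /proc/sys/vm. An illustrative write to extra_free_kbytes (values are examples only; the knob widens the gap between the min and low watermarks by the given amount):

	/* Reserve an extra 16 MiB of free memory (illustrative value). */
	FILE *f = fopen("/proc/sys/vm/extra_free_kbytes", "w");

	if (f) {
		fprintf(f, "%d\n", 16 * 1024);
		fclose(f);
	}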
@@ -2015,15 +2244,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- *valp = -*lvalp;
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- }
+ *valp = *negp ? -*lvalp : *lvalp;
} else {
int val = *valp;
if (val < 0) {
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 10a1d7dc9313..4a816bab38a2 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -138,6 +138,8 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
{ CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
+ { CTL_INT, KERN_BOOT_REASON, "boot_reason" },
+ { CTL_INT, KERN_COLD_BOOT, "cold_boot" },
{}
};
@@ -523,6 +525,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
{ CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
{ CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
{ CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_PREFIX_ROUTE, "accept_ra_prefix_route" },
{}
};
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 49eca0beed32..5819ca07a22b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -12,3 +12,5 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+
+ccflags-y += -Idrivers/cpuidle
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 7fbba635a549..0cdc34ebd8d1 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,11 @@
#include <linux/workqueue.h>
#include <linux/freezer.h>
+#ifdef CONFIG_MSM_PM
+#include "lpm-levels.h"
+#endif
+#include <linux/workqueue.h>
+
/**
* struct alarm_base - Alarm timer bases
* @lock: Lock for synchronized access to the base
@@ -46,14 +51,130 @@ static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
static struct wakeup_source *ws;
+static struct delayed_work work;
+static struct workqueue_struct *power_off_alarm_workqueue;
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
static DEFINE_SPINLOCK(rtcdev_lock);
+static struct mutex power_on_alarm_lock;
+static struct alarm init_alarm;
/**
+ * power_on_alarm_init - Init power on alarm value
+ *
+ * Read the RTC alarm value after the device boots up and add this
+ * alarm to the alarm queue.
+ */
+void power_on_alarm_init(void)
+{
+ struct rtc_wkalrm rtc_alarm;
+ struct rtc_time rt;
+ unsigned long alarm_time;
+ struct rtc_device *rtc;
+ ktime_t alarm_ktime;
+
+ rtc = alarmtimer_get_rtcdev();
+
+ if (!rtc)
+ return;
+
+ rtc_read_alarm(rtc, &rtc_alarm);
+ rt = rtc_alarm.time;
+
+ rtc_tm_to_time(&rt, &alarm_time);
+
+ if (alarm_time) {
+ alarm_ktime = ktime_set(alarm_time, 0);
+ alarm_init(&init_alarm, ALARM_POWEROFF_REALTIME, NULL);
+ alarm_start(&init_alarm, alarm_ktime);
+ }
+}
+
+/**
+ * set_power_on_alarm - set power on alarm value into rtc register
+ *
+ * Get the soonest power off alarm timer and set the alarm value into rtc
+ * register.
+ */
+void set_power_on_alarm(void)
+{
+ int rc;
+ struct timespec wall_time, alarm_ts;
+ long alarm_secs = 0L;
+ long rtc_secs, alarm_time, alarm_delta;
+ struct rtc_time rtc_time;
+ struct rtc_wkalrm alarm;
+ struct rtc_device *rtc;
+ struct timerqueue_node *next;
+ unsigned long flags;
+ struct alarm_base *base = &alarm_bases[ALARM_POWEROFF_REALTIME];
+
+ rc = mutex_lock_interruptible(&power_on_alarm_lock);
+ if (rc != 0)
+ return;
+
+ spin_lock_irqsave(&base->lock, flags);
+ next = timerqueue_getnext(&base->timerqueue);
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ if (next) {
+ alarm_ts = ktime_to_timespec(next->expires);
+ alarm_secs = alarm_ts.tv_sec;
+ }
+
+ if (!alarm_secs)
+ goto disable_alarm;
+
+ getnstimeofday(&wall_time);
+
+ /*
+ * alarm_secs has to be greater than "wall_time + 1".
+ * This makes sure that the alarm time is always later
+ * than the wall time.
+ */
+ if (alarm_secs <= wall_time.tv_sec + 1)
+ goto disable_alarm;
+
+ rtc = alarmtimer_get_rtcdev();
+ if (!rtc)
+ goto exit;
+
+ rtc_read_time(rtc, &rtc_time);
+ rtc_tm_to_time(&rtc_time, &rtc_secs);
+ alarm_delta = wall_time.tv_sec - rtc_secs;
+ alarm_time = alarm_secs - alarm_delta;
+
+ rtc_time_to_tm(alarm_time, &alarm.time);
+ alarm.enabled = 1;
+ rc = rtc_set_alarm(rtcdev, &alarm);
+ if (rc)
+ goto disable_alarm;
+
+ mutex_unlock(&power_on_alarm_lock);
+ return;
+
+disable_alarm:
+ rtc_alarm_irq_enable(rtcdev, 0);
+exit:
+ mutex_unlock(&power_on_alarm_lock);
+}
+
+static void alarmtimer_triggered_func(void *p)
+{
+ struct rtc_device *rtc = rtcdev;
+
+ if (!(rtc->irq_data & RTC_AF))
+ return;
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+}
+
+static struct rtc_task alarmtimer_rtc_task = {
+ .func = alarmtimer_triggered_func
+};
+/**
* alarmtimer_get_rtcdev - Return selected rtcdevice
*
* This function returns the rtc device to use for wakealarms.
@@ -63,7 +184,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
- struct rtc_device *ret;
+ struct rtc_device *ret = NULL;
spin_lock_irqsave(&rtcdev_lock, flags);
ret = rtcdev;
@@ -77,33 +198,48 @@ static int alarmtimer_rtc_add_device(struct device *dev,
struct class_interface *class_intf)
{
unsigned long flags;
+ int err = 0;
struct rtc_device *rtc = to_rtc_device(dev);
-
if (rtcdev)
return -EBUSY;
-
if (!rtc->ops->set_alarm)
return -1;
- if (!device_may_wakeup(rtc->dev.parent))
- return -1;
spin_lock_irqsave(&rtcdev_lock, flags);
if (!rtcdev) {
+ err = rtc_irq_register(rtc, &alarmtimer_rtc_task);
+ if (err)
+ goto rtc_irq_reg_err;
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
}
+
+rtc_irq_reg_err:
spin_unlock_irqrestore(&rtcdev_lock, flags);
- return 0;
+ return err;
+
+}
+
+static void alarmtimer_rtc_remove_device(struct device *dev,
+ struct class_interface *class_intf)
+{
+ if (rtcdev && dev == &rtcdev->dev) {
+ rtc_irq_unregister(rtcdev, &alarmtimer_rtc_task);
+ rtcdev = NULL;
+ }
}
static inline void alarmtimer_rtc_timer_init(void)
{
+ mutex_init(&power_on_alarm_lock);
+
rtc_timer_init(&rtctimer, NULL, NULL);
}
static struct class_interface alarmtimer_rtc_interface = {
.add_dev = &alarmtimer_rtc_add_device,
+ .remove_dev = &alarmtimer_rtc_remove_device,
};
static int alarmtimer_rtc_interface_setup(void)
@@ -124,8 +260,14 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
static inline void alarmtimer_rtc_timer_init(void) { }
+void set_power_on_alarm(void) { }
#endif
+static void alarm_work_func(struct work_struct *unused)
+{
+ set_power_on_alarm();
+}
+
/**
* alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
* @base: pointer to the base where the timer is being run
@@ -195,6 +337,10 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
+ /* set next power off alarm */
+ if (alarm->type == ALARM_POWEROFF_REALTIME)
+ queue_delayed_work(power_off_alarm_workqueue, &work, 0);
+
return ret;
}
@@ -217,6 +363,70 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
* set an rtc timer to fire that far into the future, which
* will wake us from suspend.
*/
+#if defined(CONFIG_RTC_DRV_QPNP) && defined(CONFIG_MSM_PM)
+static int alarmtimer_suspend(struct device *dev)
+{
+ struct rtc_time tm;
+ ktime_t min, now;
+ unsigned long flags;
+ struct rtc_device *rtc;
+ int i;
+ int ret = 0;
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ min = freezer_delta;
+ freezer_delta = ktime_set(0, 0);
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+
+ rtc = alarmtimer_get_rtcdev();
+ /* If we have no rtcdev, just return */
+ if (!rtc)
+ return 0;
+
+ /* Find the soonest timer to expire */
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ struct alarm_base *base = &alarm_bases[i];
+ struct timerqueue_node *next;
+ ktime_t delta;
+
+ spin_lock_irqsave(&base->lock, flags);
+ next = timerqueue_getnext(&base->timerqueue);
+ spin_unlock_irqrestore(&base->lock, flags);
+ if (!next)
+ continue;
+ delta = ktime_sub(next->expires, base->gettime());
+ if (!min.tv64 || (delta.tv64 < min.tv64))
+ min = delta;
+ }
+ if (min.tv64 == 0)
+ return 0;
+
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
+
+ /* Setup a timer to fire that far in the future */
+ rtc_timer_cancel(rtc, &rtctimer);
+ rtc_read_time(rtc, &tm);
+ now = rtc_tm_to_ktime(tm);
+ now = ktime_add(now, min);
+ if (poweron_alarm) {
+ struct rtc_time tm_val;
+ unsigned long secs;
+
+ tm_val = rtc_ktime_to_tm(min);
+ rtc_tm_to_time(&tm_val, &secs);
+ lpm_suspend_wake_time(secs);
+ } else {
+ /* Set alarm; if it is in the past, briefly reject suspend to handle it */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, MSEC_PER_SEC);
+ }
+ return ret;
+}
+#else
static int alarmtimer_suspend(struct device *dev)
{
struct rtc_time tm;
@@ -226,6 +436,8 @@ static int alarmtimer_suspend(struct device *dev)
int i;
int ret;
+ cancel_delayed_work_sync(&work);
+
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
freezer_delta = ktime_set(0, 0);
@@ -271,11 +483,31 @@ static int alarmtimer_suspend(struct device *dev)
__pm_wakeup_event(ws, MSEC_PER_SEC);
return ret;
}
+#endif
+static int alarmtimer_resume(struct device *dev)
+{
+ struct rtc_device *rtc;
+
+ rtc = alarmtimer_get_rtcdev();
+ /* If we have no rtcdev, just return */
+ if (!rtc)
+ return 0;
+ rtc_timer_cancel(rtc, &rtctimer);
+
+ queue_delayed_work(power_off_alarm_workqueue, &work, 0);
+ return 0;
+}
+
#else
static int alarmtimer_suspend(struct device *dev)
{
return 0;
}
+
+static int alarmtimer_resume(struct device *dev)
+{
+ return 0;
+}
#endif
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
@@ -443,12 +675,14 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
* clock2alarm - helper that converts from clockid to alarmtypes
* @clockid: clockid.
*/
-static enum alarmtimer_type clock2alarm(clockid_t clockid)
+enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
if (clockid == CLOCK_BOOTTIME_ALARM)
return ALARM_BOOTTIME;
+ if (clockid == CLOCK_POWEROFF_ALARM)
+ return ALARM_POWEROFF_REALTIME;
return -1;
}
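With the new clock id wired into clock2alarm(), a power-off alarm is armed from userspace like any other posix alarm timer. A hedged sketch, assuming CLOCK_POWEROFF_ALARM is exported by the uapi side of this series (the numeric fallback is an assumption):

	#include <time.h>

	#ifndef CLOCK_POWEROFF_ALARM
	#define CLOCK_POWEROFF_ALARM 12	/* assumed uapi value */
	#endif

	static int arm_poweroff_alarm(time_t when)
	{
		timer_t t;
		struct itimerspec its = {
			/* absolute CLOCK_REALTIME seconds */
			.it_value.tv_sec = when,
		};

		if (timer_create(CLOCK_POWEROFF_ALARM, NULL, &t))
			return -1;
		return timer_settime(t, TIMER_ABSTIME, &its, NULL);
	}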
@@ -800,6 +1034,7 @@ out:
/* Suspend hook structures */
static const struct dev_pm_ops alarmtimer_pm_ops = {
.suspend = alarmtimer_suspend,
+ .resume = alarmtimer_resume,
};
static struct platform_driver alarmtimer_driver = {
@@ -834,10 +1069,13 @@ static int __init alarmtimer_init(void)
posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+ posix_timers_register_clock(CLOCK_POWEROFF_ALARM, &alarm_clock);
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_POWEROFF_REALTIME].base_clockid = CLOCK_REALTIME;
+ alarm_bases[ALARM_POWEROFF_REALTIME].gettime = &ktime_get_real;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
for (i = 0; i < ALARM_NUMTYPE; i++) {
@@ -859,8 +1097,24 @@ static int __init alarmtimer_init(void)
goto out_drv;
}
ws = wakeup_source_register("alarmtimer");
- return 0;
+ if (!ws) {
+ error = -ENOMEM;
+ goto out_ws;
+ }
+
+ INIT_DELAYED_WORK(&work, alarm_work_func);
+ power_off_alarm_workqueue =
+ create_singlethread_workqueue("power_off_alarm");
+ if (!power_off_alarm_workqueue) {
+ error = -ENOMEM;
+ goto out_wq;
+ }
+ return 0;
+out_wq:
+ wakeup_source_unregister(ws);
+out_ws:
+ platform_device_unregister(pdev);
out_drv:
platform_driver_unregister(&alarmtimer_driver);
out_if:
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a26036d37a38..0637823aa5a6 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -70,6 +70,7 @@ struct clock_data {
static struct hrtimer sched_clock_timer;
static int irqtime = -1;
+static int initialized;
core_param(irqtime, irqtime, int, 0400);
@@ -231,6 +232,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pF as sched_clock source\n", read);
}
+int sched_clock_initialized(void)
+{
+ return initialized;
+}
+
void __init sched_clock_postinit(void)
{
/*
@@ -249,6 +255,8 @@ void __init sched_clock_postinit(void)
hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sched_clock_timer.function = sched_clock_poll;
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+
+ initialized = 1;
}
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 22c57e191a23..651ff1a3a306 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -24,6 +24,7 @@
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
#include <linux/context_tracking.h>
+#include <linux/rq_stats.h>
#include <asm/irq_regs.h>
@@ -31,6 +32,10 @@
#include <trace/events/timer.h>
+struct rq_data rq_info;
+struct workqueue_struct *rq_wq;
+spinlock_t rq_lock;
+
/*
* Per cpu nohz control structure
*/
@@ -41,6 +46,21 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
*/
static ktime_t last_jiffies_update;
+u64 jiffy_to_ktime_ns(u64 *now, u64 *jiffy_ktime_ns)
+{
+ u64 cur_jiffies;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&jiffies_lock);
+ *now = ktime_get_ns();
+ *jiffy_ktime_ns = ktime_to_ns(last_jiffies_update);
+ cur_jiffies = get_jiffies_64();
+ } while (read_seqretry(&jiffies_lock, seq));
+
+ return cur_jiffies;
+}
+
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
@@ -143,7 +163,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
}
@@ -430,7 +450,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now);
local_irq_restore(flags);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
/*
@@ -701,7 +721,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
update_cpu_load_nohz();
calc_load_exit_idle();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
@@ -1049,6 +1069,51 @@ void tick_irq_enter(void)
* High resolution timer specific code
*/
#ifdef CONFIG_HIGH_RES_TIMERS
+static void update_rq_stats(void)
+{
+ unsigned long jiffy_gap = 0;
+ unsigned int rq_avg = 0;
+ unsigned long flags = 0;
+
+ jiffy_gap = jiffies - rq_info.rq_poll_last_jiffy;
+
+ if (jiffy_gap >= rq_info.rq_poll_jiffies) {
+
+ spin_lock_irqsave(&rq_lock, flags);
+
+ if (!rq_info.rq_avg)
+ rq_info.rq_poll_total_jiffies = 0;
+
+ rq_avg = nr_running() * 10;
+
+ if (rq_info.rq_poll_total_jiffies) {
+ rq_avg = (rq_avg * jiffy_gap) +
+ (rq_info.rq_avg *
+ rq_info.rq_poll_total_jiffies);
+ do_div(rq_avg,
+ rq_info.rq_poll_total_jiffies + jiffy_gap);
+ }
+
+ rq_info.rq_avg = rq_avg;
+ rq_info.rq_poll_total_jiffies += jiffy_gap;
+ rq_info.rq_poll_last_jiffy = jiffies;
+
+ spin_unlock_irqrestore(&rq_lock, flags);
+ }
+}
+
+static void wakeup_user(void)
+{
+ unsigned long jiffy_gap;
+
+ jiffy_gap = jiffies - rq_info.def_timer_last_jiffy;
+
+ if (jiffy_gap >= rq_info.def_timer_jiffies) {
+ rq_info.def_timer_last_jiffy = jiffies;
+ queue_work(rq_wq, &rq_info.def_timer_work);
+ }
+}
+
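update_rq_stats() keeps a jiffy-weighted running average: the new sample (nr_running() * 10, fixed point with one implied decimal) is blended with the previous average in proportion to the jiffies each covers. Worked example: with rq_avg = 20 (2.0 tasks) accumulated over 8 jiffies and a new sample of 40 (4.0 tasks) covering a 2-jiffy gap, the update yields (40 * 2 + 20 * 8) / (8 + 2) = 24, i.e. 2.4 runnable tasks on average.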
/*
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled.
@@ -1066,9 +1131,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
* Do not call, when we are not in irq context and have
* no valid regs pointer
*/
- if (regs)
+ if (regs) {
tick_sched_handle(ts, regs);
+ if (rq_info.init == 1 &&
+ tick_do_timer_cpu == smp_processor_id()) {
+ /*
+ * update run queue statistics
+ */
+ update_rq_stats();
+
+ /*
+ * wakeup user if needed
+ */
+ wakeup_user();
+ }
+ }
+
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
@@ -1181,3 +1260,8 @@ int tick_check_oneshot_change(int allow_nohz)
tick_nohz_switch_to_nohz();
return 0;
}
+
+ktime_t *get_next_event_cpu(unsigned int cpu)
+{
+ return &(per_cpu(tick_cpu_device, cpu).evtdev->next_event);
+}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bbc5d1114583..51896272fcde 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -94,12 +94,15 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
+static inline void __run_timers(struct tvec_base *base);
static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
unsigned int sysctl_timer_migration = 1;
+struct tvec_base tvec_base_deferrable;
+
void timers_update_migration(bool update_nohz)
{
bool on = sysctl_timer_migration && tick_nohz_active;
@@ -135,18 +138,62 @@ int timer_migration_handler(struct ctl_table *table, int write,
}
static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
+ int pinned, u32 timer_flags)
{
+ if (!pinned && !(timer_flags & TIMER_PINNED_ON_CPU) &&
+ (timer_flags & TIMER_DEFERRABLE))
+ return &tvec_base_deferrable;
if (pinned || !base->migration_enabled)
return this_cpu_ptr(&tvec_bases);
return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
}
+
+static inline void __run_deferrable_timers(void)
+{
+ if (smp_processor_id() == tick_do_timer_cpu &&
+ time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies))
+ __run_timers(&tvec_base_deferrable);
+}
+
+static inline void init_timer_deferrable_global(void)
+{
+ tvec_base_deferrable.cpu = nr_cpu_ids;
+ spin_lock_init(&tvec_base_deferrable.lock);
+ tvec_base_deferrable.timer_jiffies = jiffies;
+ tvec_base_deferrable.next_timer = tvec_base_deferrable.timer_jiffies;
+}
+
+static inline struct tvec_base *get_timer_base(u32 timer_flags)
+{
+ if (!(timer_flags & TIMER_PINNED_ON_CPU) &&
+ timer_flags & TIMER_DEFERRABLE)
+ return &tvec_base_deferrable;
+ else
+ return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK);
+}
#else
static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
+ int pinned, u32 timer_flags)
{
return this_cpu_ptr(&tvec_bases);
}
+
+static inline void __run_deferrable_timers(void)
+{
+}
+
+static inline void init_timer_deferrable_global(void)
+{
+ /*
+ * The cpu-unbound deferrable timer base is only initialized when
+ * CONFIG_SMP is set; a UP kernel handles these timers with the cpu 0 base.
+ */
+}
+
+static inline struct tvec_base *get_timer_base(u32 timer_flags)
+{
+ return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK);
+}
#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -768,7 +815,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
struct tvec_base *base;
if (!(tf & TIMER_MIGRATING)) {
- base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+ base = get_timer_base(tf);
spin_lock_irqsave(&base->lock, *flags);
if (timer->flags == tf)
return base;
@@ -797,7 +844,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- new_base = get_target_base(base, pinned);
+ new_base = get_target_base(base, pinned, timer->flags);
if (base != new_base) {
/*
@@ -819,6 +866,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
}
+ if (pinned == TIMER_PINNED)
+ timer->flags |= TIMER_PINNED_ON_CPU;
+ else
+ timer->flags &= ~TIMER_PINNED_ON_CPU;
timer->expires = expires;
internal_add_timer(base, timer);
@@ -1000,6 +1051,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ timer->flags |= TIMER_PINNED_ON_CPU;
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
@@ -1433,6 +1485,8 @@ static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ __run_deferrable_timers();
+
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
}
@@ -1656,6 +1710,8 @@ static void __init init_timer_cpus(void)
for_each_possible_cpu(cpu)
init_timer_cpu(cpu);
+
+ init_timer_deferrable_global();
}
void __init init_timers(void)
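With this change, a timer that is marked deferrable and is not pinned migrates to the global tvec_base_deferrable and only expires from the tick_do_timer_cpu's softirq. A sketch of such a timer on the 4.4-era API (names and period are illustrative):

	static struct timer_list housekeep_timer;

	static void housekeep_fn(unsigned long data)
	{
		/* periodic, power-insensitive work goes here */
		mod_timer(&housekeep_timer, jiffies + HZ);
	}

	static void housekeep_init(void)
	{
		/* TIMER_DEFERRABLE and no add_timer_on() => cpu-unbound */
		__setup_timer(&housekeep_timer, housekeep_fn, 0UL,
			      TIMER_DEFERRABLE);
		mod_timer(&housekeep_timer, jiffies + HZ);
	}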
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e45db6b0d878..048bf074bef9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,6 +77,9 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
+config GPU_TRACEPOINTS
+ bool
+
config CONTEXT_SWITCH_TRACER
bool
@@ -86,6 +89,31 @@ config RING_BUFFER_ALLOW_SWAP
Allow the use of ring_buffer_swap_cpu.
Adds a very slight overhead to tracing when enabled.
+config IPC_LOGGING
+ bool "Debug Logging for IPC Drivers"
+ select GENERIC_TRACER
+ help
+ This option enables debug logging for IPC drivers.
+
+ If in doubt, say no.
+
+config QCOM_RTB
+ bool "Register tracing"
+ help
+ Add support for logging different events to a small uncached
+ region. This is designed to aid in debugging reset cases where the
+ caches may not be flushed before the target resets.
+
+config QCOM_RTB_SEPARATE_CPUS
+ bool "Separate entries for each cpu"
+ depends on QCOM_RTB
+ depends on SMP
+ help
+ Under some circumstances, it may be beneficial to give dedicated space
+ for each cpu to log accesses. Selecting this option will log each cpu
+ separately. This will guarantee that the last accesses for each cpu
+ will be logged, but there will be fewer entries per cpu.
+
# All tracer options should select GENERIC_TRACER. For those options that are
# enabled by all tracers (context switch and event tracer) they select TRACING.
# This allows those options to appear when no other tracer is selected. But the
@@ -485,6 +513,19 @@ config FUNCTION_PROFILER
If in doubt, say N.
+config CPU_FREQ_SWITCH_PROFILER
+ bool "CPU frequency switch time profiler"
+ select GENERIC_TRACER
+ help
+ This option enables the CPU frequency switch profiler. A file is
+ created in debugfs called "cpu_freq_switch_profile_enabled", which
+ defaults to zero. When a 1 is echoed into this file, profiling begins.
+ When a zero is echoed, profiling stops. A "cpu_freq_switch" file is
+ also created in the trace_stats directory; this file shows the
+ switches that have occurred and duration statistics.
+
+ If in doubt, say N.
+
config FTRACE_MCOUNT_RECORD
def_bool y
depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 9b1044e936a6..2acad4b6a92a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_CPU_FREQ_SWITCH_PROFILER) += trace_cpu_freq_switch.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
@@ -64,7 +65,13 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
+obj-$(CONFIG_QCOM_RTB) += msm_rtb.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+obj-$(CONFIG_IPC_LOGGING) += ipc_logging.o
+ifdef CONFIG_DEBUG_FS
+obj-$(CONFIG_IPC_LOGGING) += ipc_logging_debug.o
+endif
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a990824c8604..7b6127653a37 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -199,9 +199,9 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
+ int rw, u32 what, int error, int pdu_len,
+ void *pdu_data, struct task_struct *tsk)
{
- struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
struct ring_buffer *buffer = NULL;
struct blk_io_trace *t;
@@ -708,18 +708,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
unsigned int nr_bytes, u32 what)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
+ /*
+ * Use the bio context for all events except ISSUE and
+ * COMPLETE events.
+ *
+ * Not all the pages in the bio are dirtied by the same task, but
+ * most likely they will be, since the sectors accessed on the
+ * device must be adjacent.
+ */
+ if (!((what == BLK_TA_ISSUE) || (what == BLK_TA_COMPLETE)) &&
+ bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
+ rq->bio->bi_io_vec->bv_page &&
+ rq->bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty;
+
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
__blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
- what, rq->errors, rq->cmd_len, rq->cmd);
+ what, rq->errors, rq->cmd_len, rq->cmd, tsk);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
- rq->cmd_flags, what, rq->errors, 0, NULL);
+ rq->cmd_flags, what, rq->errors, 0, NULL, tsk);
}
}
@@ -771,12 +786,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
+ /*
+ * Not all the pages in the bio are dirtied by the same task, but
+ * most likely they will be, since the sectors accessed on the
+ * device must be adjacent.
+ */
+ if (bio_has_data(bio) && bio->bi_io_vec && bio->bi_io_vec->bv_page &&
+ bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, what, error, 0, NULL);
+ bio->bi_rw, what, error, 0, NULL, tsk);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -824,7 +849,8 @@ static void blk_add_trace_getrq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0,
+ NULL, current);
}
}
@@ -840,7 +866,7 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bt)
__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, current);
}
}
@@ -849,7 +875,8 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL,
+ current);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -866,7 +893,8 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu,
+ current);
}
}
@@ -875,13 +903,19 @@ static void blk_add_trace_split(void *ignore,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
+ if (bio_has_data(bio) && bio->bi_io_vec &&
+ bio->bi_io_vec->bv_page &&
+ bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- bio->bi_error, sizeof(rpdu), &rpdu);
+ bio->bi_error, sizeof(rpdu), &rpdu, tsk);
}
}
@@ -904,6 +938,7 @@ static void blk_add_trace_bio_remap(void *ignore,
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
@@ -912,9 +947,14 @@ static void blk_add_trace_bio_remap(void *ignore,
r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
r.sector_from = cpu_to_be64(from);
+ if (bio_has_data(bio) && bio->bi_io_vec &&
+ bio->bi_io_vec->bv_page &&
+ bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
- sizeof(r), &r);
+ sizeof(r), &r, tsk);
}
/**
@@ -937,6 +977,7 @@ static void blk_add_trace_rq_remap(void *ignore,
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
@@ -945,9 +986,14 @@ static void blk_add_trace_rq_remap(void *ignore,
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
r.sector_from = cpu_to_be64(from);
+ if (bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
+ rq->bio->bi_io_vec->bv_page &&
+ rq->bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
- sizeof(r), &r);
+ sizeof(r), &r, tsk);
}
/**
@@ -966,16 +1012,22 @@ void blk_add_driver_data(struct request_queue *q,
void *data, size_t len)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
+ if (bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
+ rq->bio->bi_io_vec->bv_page &&
+ rq->bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty;
+
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, rq->errors, len, data, tsk);
else
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, rq->errors, len, data, tsk);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
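The tsk_dirty lookup is open-coded at each tracepoint above; the same test could be factored into one helper, shown here only as a sketch (not part of the patch, and it presumes the struct page tsk_dirty field this series introduces):

	/* Sketch: pick the task to charge a bio's trace event to. */
	static struct task_struct *blk_trace_bio_task(struct bio *bio)
	{
		if (bio && bio_has_data(bio) && bio->bi_io_vec &&
		    bio->bi_io_vec->bv_page &&
		    bio->bi_io_vec->bv_page->tsk_dirty)
			return bio->bi_io_vec->bv_page->tsk_dirty;
		return current;
	}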
diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c
new file mode 100644
index 000000000000..a4b3f00faee3
--- /dev/null
+++ b/kernel/trace/gpu-traces.c
@@ -0,0 +1,23 @@
+/*
+ * GPU tracepoints
+ *
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch);
+EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue);
diff --git a/kernel/trace/ipc_logging.c b/kernel/trace/ipc_logging.c
new file mode 100644
index 000000000000..2c3e0998d400
--- /dev/null
+++ b/kernel/trace/ipc_logging.c
@@ -0,0 +1,876 @@
+/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/arch_timer.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/ipc_logging.h>
+
+#include "ipc_logging_private.h"
+
+#define LOG_PAGE_DATA_SIZE sizeof(((struct ipc_log_page *)0)->data)
+#define LOG_PAGE_FLAG (1 << 31)
+
+static LIST_HEAD(ipc_log_context_list);
+static DEFINE_RWLOCK(context_list_lock_lha1);
+static void *get_deserialization_func(struct ipc_log_context *ilctxt,
+ int type);
+
+static struct ipc_log_page *get_first_page(struct ipc_log_context *ilctxt)
+{
+ struct ipc_log_page_header *p_pghdr;
+ struct ipc_log_page *pg = NULL;
+
+ if (!ilctxt)
+ return NULL;
+ p_pghdr = list_first_entry(&ilctxt->page_list,
+ struct ipc_log_page_header, list);
+ pg = container_of(p_pghdr, struct ipc_log_page, hdr);
+ return pg;
+}
+
+/**
+ * is_nd_read_empty - Returns true if no data is available to read in log
+ *
+ * @ilctxt: logging context
+ * @returns: 1 if context is empty; 0 if not empty; <0 for failure
+ *
+ * This is for the debugfs read pointer which allows for a non-destructive read.
+ * There may still be data in the log, but it may have already been read.
+ */
+static int is_nd_read_empty(struct ipc_log_context *ilctxt)
+{
+ if (!ilctxt)
+ return -EINVAL;
+
+ return ((ilctxt->nd_read_page == ilctxt->write_page) &&
+ (ilctxt->nd_read_page->hdr.nd_read_offset ==
+ ilctxt->write_page->hdr.write_offset));
+}
+
+/**
+ * is_read_empty - Returns true if no data is available in log
+ *
+ * @ilctxt: logging context
+ * @returns: 1 if context is empty; 0 if not empty; <0 for failure
+ *
+ * This is for the actual log contents. If it is empty, then there
+ * is no data at all in the log.
+ */
+static int is_read_empty(struct ipc_log_context *ilctxt)
+{
+ if (!ilctxt)
+ return -EINVAL;
+
+ return ((ilctxt->read_page == ilctxt->write_page) &&
+ (ilctxt->read_page->hdr.read_offset ==
+ ilctxt->write_page->hdr.write_offset));
+}
+
+/**
+ * is_nd_read_equal_read - Return true if the non-destructive read is equal to
+ * the destructive read
+ *
+ * @ilctxt: logging context
+ * @returns: true if nd read is equal to read; false otherwise
+ */
+static bool is_nd_read_equal_read(struct ipc_log_context *ilctxt)
+{
+ uint16_t read_offset;
+ uint16_t nd_read_offset;
+
+ if (ilctxt->nd_read_page == ilctxt->read_page) {
+ read_offset = ilctxt->read_page->hdr.read_offset;
+ nd_read_offset = ilctxt->nd_read_page->hdr.nd_read_offset;
+
+ if (read_offset == nd_read_offset)
+ return true;
+ }
+
+ return false;
+}
+
+
+static struct ipc_log_page *get_next_page(struct ipc_log_context *ilctxt,
+ struct ipc_log_page *cur_pg)
+{
+ struct ipc_log_page_header *p_pghdr;
+ struct ipc_log_page *pg = NULL;
+
+ if (!ilctxt || !cur_pg)
+ return NULL;
+
+ if (ilctxt->last_page == cur_pg)
+ return ilctxt->first_page;
+
+ p_pghdr = list_first_entry(&cur_pg->hdr.list,
+ struct ipc_log_page_header, list);
+ pg = container_of(p_pghdr, struct ipc_log_page, hdr);
+
+ return pg;
+}
+
+/**
+ * ipc_log_read - do non-destructive read of the log
+ *
+ * @ilctxt: Logging context
+ * @data: Data pointer to receive the data
+ * @data_size: Number of bytes to read (must be <= bytes available in log)
+ *
+ * This read will update a runtime read pointer, but will not affect the actual
+ * contents of the log which allows for reading the logs continuously while
+ * debugging and if the system crashes, then the full logs can still be
+ * extracted.
+ */
+static void ipc_log_read(struct ipc_log_context *ilctxt,
+ void *data, int data_size)
+{
+ int bytes_to_read;
+
+ bytes_to_read = MIN(LOG_PAGE_DATA_SIZE
+ - ilctxt->nd_read_page->hdr.nd_read_offset,
+ data_size);
+
+ memcpy(data, (ilctxt->nd_read_page->data +
+ ilctxt->nd_read_page->hdr.nd_read_offset), bytes_to_read);
+
+ if (bytes_to_read != data_size) {
+ /* not enough space, wrap read to next page */
+ ilctxt->nd_read_page->hdr.nd_read_offset = 0;
+ ilctxt->nd_read_page = get_next_page(ilctxt,
+ ilctxt->nd_read_page);
+ BUG_ON(ilctxt->nd_read_page == NULL);
+
+ memcpy((data + bytes_to_read),
+ (ilctxt->nd_read_page->data +
+ ilctxt->nd_read_page->hdr.nd_read_offset),
+ (data_size - bytes_to_read));
+ bytes_to_read = (data_size - bytes_to_read);
+ }
+ ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read;
+}
+
+/**
+ * ipc_log_drop - do destructive read of the log
+ *
+ * @ilctxt: Logging context
+ * @data: Data pointer to receive the data (or NULL)
+ * @data_size: Number of bytes to read (must be <= bytes available in log)
+ */
+static void ipc_log_drop(struct ipc_log_context *ilctxt, void *data,
+ int data_size)
+{
+ int bytes_to_read;
+ bool push_nd_read;
+
+ bytes_to_read = MIN(LOG_PAGE_DATA_SIZE
+ - ilctxt->read_page->hdr.read_offset,
+ data_size);
+ if (data)
+ memcpy(data, (ilctxt->read_page->data +
+ ilctxt->read_page->hdr.read_offset), bytes_to_read);
+
+ if (bytes_to_read != data_size) {
+ /* not enough space, wrap read to next page */
+ push_nd_read = is_nd_read_equal_read(ilctxt);
+
+ ilctxt->read_page->hdr.read_offset = 0;
+ if (push_nd_read) {
+ ilctxt->read_page->hdr.nd_read_offset = 0;
+ ilctxt->read_page = get_next_page(ilctxt,
+ ilctxt->read_page);
+ BUG_ON(ilctxt->read_page == NULL);
+ ilctxt->nd_read_page = ilctxt->read_page;
+ } else {
+ ilctxt->read_page = get_next_page(ilctxt,
+ ilctxt->read_page);
+ BUG_ON(ilctxt->read_page == NULL);
+ }
+
+ if (data)
+ memcpy((data + bytes_to_read),
+ (ilctxt->read_page->data +
+ ilctxt->read_page->hdr.read_offset),
+ (data_size - bytes_to_read));
+
+ bytes_to_read = (data_size - bytes_to_read);
+ }
+
+ /* update non-destructive read pointer if necessary */
+ push_nd_read = is_nd_read_equal_read(ilctxt);
+ ilctxt->read_page->hdr.read_offset += bytes_to_read;
+ ilctxt->write_avail += data_size;
+
+ if (push_nd_read)
+ ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read;
+}
+
+/**
+ * msg_read - Reads a message.
+ *
+ * If a message is read successfully, then the message context
+ * will be set to:
+ * .hdr message header .size and .type values
+ * .offset beginning of message data
+ *
+ * @ilctxt Logging context
+ * @ectxt Message context
+ *
+ * @returns 0 - no message available; >0 message size; <0 error
+ */
+static int msg_read(struct ipc_log_context *ilctxt,
+ struct encode_context *ectxt)
+{
+ struct tsv_header hdr;
+
+ if (!ectxt)
+ return -EINVAL;
+
+ if (is_nd_read_empty(ilctxt))
+ return 0;
+
+ ipc_log_read(ilctxt, &hdr, sizeof(hdr));
+ ectxt->hdr.type = hdr.type;
+ ectxt->hdr.size = hdr.size;
+ ectxt->offset = sizeof(hdr);
+ ipc_log_read(ilctxt, (ectxt->buff + ectxt->offset),
+ (int)hdr.size);
+
+ return sizeof(hdr) + (int)hdr.size;
+}
+
+/**
+ * msg_drop - Drops a message.
+ *
+ * @ilctxt Logging context
+ */
+static void msg_drop(struct ipc_log_context *ilctxt)
+{
+ struct tsv_header hdr;
+
+ if (!is_read_empty(ilctxt)) {
+ ipc_log_drop(ilctxt, &hdr, sizeof(hdr));
+ ipc_log_drop(ilctxt, NULL, (int)hdr.size);
+ }
+}
+
+/*
+ * Commits messages to the FIFO. If the FIFO is full, then enough
+ * messages are dropped to create space for the new message.
+ */
+void ipc_log_write(void *ctxt, struct encode_context *ectxt)
+{
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ int bytes_to_write;
+ unsigned long flags;
+
+ if (!ilctxt || !ectxt) {
+ pr_err("%s: Invalid ipc_log or encode context\n", __func__);
+ return;
+ }
+
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ while (ilctxt->write_avail <= ectxt->offset)
+ msg_drop(ilctxt);
+
+ bytes_to_write = MIN(LOG_PAGE_DATA_SIZE
+ - ilctxt->write_page->hdr.write_offset,
+ ectxt->offset);
+ memcpy((ilctxt->write_page->data +
+ ilctxt->write_page->hdr.write_offset),
+ ectxt->buff, bytes_to_write);
+
+ if (bytes_to_write != ectxt->offset) {
+ uint64_t t_now = sched_clock();
+
+ ilctxt->write_page->hdr.write_offset += bytes_to_write;
+ ilctxt->write_page->hdr.end_time = t_now;
+
+ ilctxt->write_page = get_next_page(ilctxt, ilctxt->write_page);
+ BUG_ON(ilctxt->write_page == NULL);
+ ilctxt->write_page->hdr.write_offset = 0;
+ ilctxt->write_page->hdr.start_time = t_now;
+ memcpy((ilctxt->write_page->data +
+ ilctxt->write_page->hdr.write_offset),
+ (ectxt->buff + bytes_to_write),
+ (ectxt->offset - bytes_to_write));
+ bytes_to_write = (ectxt->offset - bytes_to_write);
+ }
+ ilctxt->write_page->hdr.write_offset += bytes_to_write;
+ ilctxt->write_avail -= ectxt->offset;
+ complete(&ilctxt->read_avail);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+}
+EXPORT_SYMBOL(ipc_log_write);
+
+/*
+ * Starts a new message after which you can add serialized data and
+ * then complete the message by calling msg_encode_end().
+ */
+void msg_encode_start(struct encode_context *ectxt, uint32_t type)
+{
+ if (!ectxt) {
+ pr_err("%s: Invalid encode context\n", __func__);
+ return;
+ }
+
+ ectxt->hdr.type = type;
+ ectxt->hdr.size = 0;
+ ectxt->offset = sizeof(ectxt->hdr);
+}
+EXPORT_SYMBOL(msg_encode_start);
+
+/*
+ * Completes the message
+ */
+void msg_encode_end(struct encode_context *ectxt)
+{
+ if (!ectxt) {
+ pr_err("%s: Invalid encode context\n", __func__);
+ return;
+ }
+
+ /* finalize data size */
+ ectxt->hdr.size = ectxt->offset - sizeof(ectxt->hdr);
+ BUG_ON(ectxt->hdr.size > MAX_MSG_SIZE);
+ memcpy(ectxt->buff, &ectxt->hdr, sizeof(ectxt->hdr));
+}
+EXPORT_SYMBOL(msg_encode_end);
+
+/*
+ * Helper function used to write data to a message context.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @data data to write
+ * @size number of bytes of data to write
+ */
+static inline int tsv_write_data(struct encode_context *ectxt,
+ void *data, uint32_t size)
+{
+ if (!ectxt) {
+ pr_err("%s: Invalid encode context\n", __func__);
+ return -EINVAL;
+ }
+ if ((ectxt->offset + size) > MAX_MSG_SIZE) {
+ pr_err("%s: No space to encode further\n", __func__);
+ return -EINVAL;
+ }
+
+ memcpy((void *)(ectxt->buff + ectxt->offset), data, size);
+ ectxt->offset += size;
+ return 0;
+}
+
+/*
+ * Helper function that writes a type to the context.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @type primitive type
+ * @size size of primitive in bytes
+ */
+static inline int tsv_write_header(struct encode_context *ectxt,
+ uint32_t type, uint32_t size)
+{
+ struct tsv_header hdr;
+
+ hdr.type = (unsigned char)type;
+ hdr.size = (unsigned char)size;
+ return tsv_write_data(ectxt, &hdr, sizeof(hdr));
+}
+
+/*
+ * Writes the current timestamp count.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ */
+int tsv_timestamp_write(struct encode_context *ectxt)
+{
+ int ret;
+ uint64_t t_now = sched_clock();
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_TIMESTAMP, sizeof(t_now));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &t_now, sizeof(t_now));
+}
+EXPORT_SYMBOL(tsv_timestamp_write);
+
+/*
+ * Writes the current QTimer timestamp count.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ */
+int tsv_qtimer_write(struct encode_context *ectxt)
+{
+ int ret;
+ uint64_t t_now = arch_counter_get_cntvct();
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_QTIMER, sizeof(t_now));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &t_now, sizeof(t_now));
+}
+EXPORT_SYMBOL(tsv_qtimer_write);
+
+/*
+ * Writes a data pointer.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @pointer pointer value to write
+ */
+int tsv_pointer_write(struct encode_context *ectxt, void *pointer)
+{
+	int ret;
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_POINTER, sizeof(pointer));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &pointer, sizeof(pointer));
+}
+EXPORT_SYMBOL(tsv_pointer_write);
+
+/*
+ * Writes a 32-bit integer value.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @n integer to write
+ */
+int tsv_int32_write(struct encode_context *ectxt, int32_t n)
+{
+	int ret;
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_INT32, sizeof(n));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &n, sizeof(n));
+}
+EXPORT_SYMBOL(tsv_int32_write);
+
+/*
+ * Writes a byte array.
+ *
+ * @ectxt context initialized by calling msg_write_start()
+ * @data Beginning address of data
+ * @data_size Size of data to be written
+ */
+int tsv_byte_array_write(struct encode_context *ectxt,
+ void *data, int data_size)
+{
+	int ret;
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_BYTE_ARRAY, data_size);
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, data, data_size);
+}
+EXPORT_SYMBOL(tsv_byte_array_write);
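+
+/*
+ * Example (illustrative, not part of this patch): the tsv_* writers
+ * above can be composed into a custom message. Assuming a logging
+ * context "ctxt" and a hypothetical user-defined type MY_TYPE_EVENT:
+ *
+ *	struct encode_context ectxt;
+ *
+ *	msg_encode_start(&ectxt, MY_TYPE_EVENT);
+ *	tsv_timestamp_write(&ectxt);
+ *	tsv_int32_write(&ectxt, value);
+ *	msg_encode_end(&ectxt);
+ *	ipc_log_write(ctxt, &ectxt);
+ */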
+
+/*
+ * Helper function to log a string
+ *
+ * @ilctxt ipc_log_context created using ipc_log_context_create()
+ * @fmt Data specified using format specifiers
+ */
+int ipc_log_string(void *ilctxt, const char *fmt, ...)
+{
+ struct encode_context ectxt;
+ int avail_size, data_size, hdr_size = sizeof(struct tsv_header);
+ va_list arg_list;
+
+ if (!ilctxt)
+ return -EINVAL;
+
+ msg_encode_start(&ectxt, TSV_TYPE_STRING);
+ tsv_timestamp_write(&ectxt);
+ tsv_qtimer_write(&ectxt);
+ avail_size = (MAX_MSG_SIZE - (ectxt.offset + hdr_size));
+ va_start(arg_list, fmt);
+ data_size = vsnprintf((ectxt.buff + ectxt.offset + hdr_size),
+ avail_size, fmt, arg_list);
+	va_end(arg_list);
+	/* vsnprintf returns the untruncated length; clamp to the buffer */
+	if (data_size > avail_size - 1)
+		data_size = avail_size - 1;
+	tsv_write_header(&ectxt, TSV_TYPE_BYTE_ARRAY, data_size);
+ ectxt.offset += data_size;
+ msg_encode_end(&ectxt);
+ ipc_log_write(ilctxt, &ectxt);
+ return 0;
+}
+EXPORT_SYMBOL(ipc_log_string);
+
+/**
+ * ipc_log_extract - Reads and deserializes log
+ *
+ * @ctxt: logging context
+ * @buff: buffer to receive the data
+ * @size: size of the buffer
+ * @returns: 0 if no data read; >0 number of bytes read; < 0 error
+ *
+ * If no data is available to be read, then the ilctxt::read_avail
+ * completion is reinitialized. This allows clients to block
+ * until new log data is saved.
+ */
+int ipc_log_extract(void *ctxt, char *buff, int size)
+{
+ struct encode_context ectxt;
+ struct decode_context dctxt;
+ void (*deserialize_func)(struct encode_context *ectxt,
+ struct decode_context *dctxt);
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ unsigned long flags;
+
+ if (size < MAX_MSG_DECODED_SIZE)
+ return -EINVAL;
+
+ dctxt.output_format = OUTPUT_DEBUGFS;
+ dctxt.buff = buff;
+ dctxt.size = size;
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ while (dctxt.size >= MAX_MSG_DECODED_SIZE &&
+ !is_nd_read_empty(ilctxt)) {
+ msg_read(ilctxt, &ectxt);
+ deserialize_func = get_deserialization_func(ilctxt,
+ ectxt.hdr.type);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+ if (deserialize_func)
+ deserialize_func(&ectxt, &dctxt);
+ else
+ pr_err("%s: unknown message 0x%x\n",
+ __func__, ectxt.hdr.type);
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ }
+ if ((size - dctxt.size) == 0)
+ reinit_completion(&ilctxt->read_avail);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+ return size - dctxt.size;
+}
+EXPORT_SYMBOL(ipc_log_extract);
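+
+/*
+ * Example (illustrative sketch): a kernel-side reader with a buffer of
+ * at least MAX_MSG_DECODED_SIZE bytes can drain the decoded log:
+ *
+ *	char buf[MAX_MSG_DECODED_SIZE];
+ *	int n;
+ *
+ *	do {
+ *		n = ipc_log_extract(ctxt, buf, sizeof(buf));
+ *		if (n > 0)
+ *			consume(buf, n);
+ *	} while (n > 0);
+ *
+ * where consume() stands in for whatever the caller does with the
+ * decoded text.
+ */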
+
+/*
+ * Helper function used to read data from a message context.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @data data to read
+ * @size number of bytes of data to read
+ */
+static void tsv_read_data(struct encode_context *ectxt,
+ void *data, uint32_t size)
+{
+ BUG_ON((ectxt->offset + size) > MAX_MSG_SIZE);
+ memcpy(data, (ectxt->buff + ectxt->offset), size);
+ ectxt->offset += size;
+}
+
+/*
+ * Helper function that reads a type from the context and updates the
+ * context pointers.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @hdr type header
+ */
+static void tsv_read_header(struct encode_context *ectxt,
+ struct tsv_header *hdr)
+{
+ BUG_ON((ectxt->offset + sizeof(*hdr)) > MAX_MSG_SIZE);
+ memcpy(hdr, (ectxt->buff + ectxt->offset), sizeof(*hdr));
+ ectxt->offset += sizeof(*hdr);
+}
+
+/*
+ * Reads a timestamp.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format (appended to the %6u.%09lu timestamp format)
+ */
+void tsv_timestamp_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ uint64_t val;
+ unsigned long nanosec_rem;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_TIMESTAMP);
+ tsv_read_data(ectxt, &val, sizeof(val));
+ nanosec_rem = do_div(val, 1000000000U);
+ IPC_SPRINTF_DECODE(dctxt, "[%6u.%09lu%s/",
+ (unsigned)val, nanosec_rem, format);
+}
+EXPORT_SYMBOL(tsv_timestamp_read);
+
+/*
+ * Reads a QTimer timestamp.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format (appended to %#18llx timestamp format)
+ */
+void tsv_qtimer_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ uint64_t val;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_QTIMER);
+ tsv_read_data(ectxt, &val, sizeof(val));
+
+ /*
+ * This gives 16 hex digits of output. The # prefix prepends
+ * a 0x, and these characters count as part of the number.
+ */
+ IPC_SPRINTF_DECODE(dctxt, "%#18llx]%s", val, format);
+}
+EXPORT_SYMBOL(tsv_qtimer_read);
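+
+/*
+ * When chained as in dfunc_string() (see ipc_logging_debug.c), the two
+ * readers above produce a decoded prefix of the form (values are
+ * illustrative):
+ *
+ *	[  123.456789012/    0x1d2a3f4b5c6d] <message body>
+ *
+ * i.e. the sched_clock() time followed by the raw QTimer count.
+ */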
+
+/*
+ * Reads a data pointer.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format
+ */
+void tsv_pointer_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ void *val;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_POINTER);
+ tsv_read_data(ectxt, &val, sizeof(val));
+
+ IPC_SPRINTF_DECODE(dctxt, format, val);
+}
+EXPORT_SYMBOL(tsv_pointer_read);
+
+/*
+ * Reads a 32-bit integer value.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format
+ */
+int32_t tsv_int32_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ int32_t val;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_INT32);
+ tsv_read_data(ectxt, &val, sizeof(val));
+
+ IPC_SPRINTF_DECODE(dctxt, format, val);
+ return val;
+}
+EXPORT_SYMBOL(tsv_int32_read);
+
+/*
+ * Reads a byte array/string.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format
+ */
+void tsv_byte_array_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_BYTE_ARRAY);
+ tsv_read_data(ectxt, dctxt->buff, hdr.size);
+ dctxt->buff += hdr.size;
+ dctxt->size -= hdr.size;
+}
+EXPORT_SYMBOL(tsv_byte_array_read);
+
+int add_deserialization_func(void *ctxt, int type,
+ void (*dfunc)(struct encode_context *,
+ struct decode_context *))
+{
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ struct dfunc_info *df_info;
+ unsigned long flags;
+
+ if (!ilctxt || !dfunc)
+ return -EINVAL;
+
+ df_info = kmalloc(sizeof(struct dfunc_info), GFP_KERNEL);
+ if (!df_info)
+		return -ENOMEM;
+
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ df_info->type = type;
+ df_info->dfunc = dfunc;
+ list_add_tail(&df_info->list, &ilctxt->dfunc_info_list);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+ return 0;
+}
+EXPORT_SYMBOL(add_deserialization_func);
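+
+/*
+ * Example (illustrative): a client logging a custom message type can
+ * register a matching decoder, mirroring how dfunc_string() is wired
+ * up for TSV_TYPE_STRING in ipc_logging_debug.c. MY_TYPE_EVENT and
+ * my_decoder are hypothetical:
+ *
+ *	static void my_decoder(struct encode_context *ectxt,
+ *			       struct decode_context *dctxt)
+ *	{
+ *		tsv_timestamp_read(ectxt, dctxt, "");
+ *		tsv_int32_read(ectxt, dctxt, " val=%d\n");
+ *	}
+ *
+ *	add_deserialization_func(ctxt, MY_TYPE_EVENT, my_decoder);
+ */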
+
+static void *get_deserialization_func(struct ipc_log_context *ilctxt,
+ int type)
+{
+ struct dfunc_info *df_info = NULL;
+
+ if (!ilctxt)
+ return NULL;
+
+ list_for_each_entry(df_info, &ilctxt->dfunc_info_list, list) {
+ if (df_info->type == type)
+ return df_info->dfunc;
+ }
+ return NULL;
+}
+
+/**
+ * ipc_log_context_create: Create a debug log context
+ * Should not be called from atomic context
+ *
+ * @max_num_pages: Number of pages of logging space required (max. 10)
+ * @mod_name : Name of the directory entry under DEBUGFS
+ * @user_version : Version number of user-defined message formats
+ *
+ * returns context id on success, NULL on failure
+ */
+void *ipc_log_context_create(int max_num_pages,
+ const char *mod_name, uint16_t user_version)
+{
+ struct ipc_log_context *ctxt;
+ struct ipc_log_page *pg = NULL;
+ int page_cnt;
+ unsigned long flags;
+
+ ctxt = kzalloc(sizeof(struct ipc_log_context), GFP_KERNEL);
+ if (!ctxt) {
+ pr_err("%s: cannot create ipc_log_context\n", __func__);
+		return NULL;
+ }
+
+ init_completion(&ctxt->read_avail);
+ INIT_LIST_HEAD(&ctxt->page_list);
+ INIT_LIST_HEAD(&ctxt->dfunc_info_list);
+ spin_lock_init(&ctxt->context_lock_lhb1);
+ for (page_cnt = 0; page_cnt < max_num_pages; page_cnt++) {
+ pg = kzalloc(sizeof(struct ipc_log_page), GFP_KERNEL);
+ if (!pg) {
+ pr_err("%s: cannot create ipc_log_page\n", __func__);
+ goto release_ipc_log_context;
+ }
+ pg->hdr.log_id = (uint64_t)(uintptr_t)ctxt;
+ pg->hdr.page_num = LOG_PAGE_FLAG | page_cnt;
+ pg->hdr.ctx_offset = (int64_t)((uint64_t)(uintptr_t)ctxt -
+ (uint64_t)(uintptr_t)&pg->hdr);
+
+ /* set magic last to signal that page init is complete */
+ pg->hdr.magic = IPC_LOGGING_MAGIC_NUM;
+ pg->hdr.nmagic = ~(IPC_LOGGING_MAGIC_NUM);
+
+ spin_lock_irqsave(&ctxt->context_lock_lhb1, flags);
+ list_add_tail(&pg->hdr.list, &ctxt->page_list);
+ spin_unlock_irqrestore(&ctxt->context_lock_lhb1, flags);
+ }
+
+ ctxt->log_id = (uint64_t)(uintptr_t)ctxt;
+ ctxt->version = IPC_LOG_VERSION;
+ strlcpy(ctxt->name, mod_name, IPC_LOG_MAX_CONTEXT_NAME_LEN);
+ ctxt->user_version = user_version;
+ ctxt->first_page = get_first_page(ctxt);
+ ctxt->last_page = pg;
+ ctxt->write_page = ctxt->first_page;
+ ctxt->read_page = ctxt->first_page;
+ ctxt->nd_read_page = ctxt->first_page;
+ ctxt->write_avail = max_num_pages * LOG_PAGE_DATA_SIZE;
+ ctxt->header_size = sizeof(struct ipc_log_page_header);
+ create_ctx_debugfs(ctxt, mod_name);
+
+ /* set magic last to signal context init is complete */
+ ctxt->magic = IPC_LOG_CONTEXT_MAGIC_NUM;
+ ctxt->nmagic = ~(IPC_LOG_CONTEXT_MAGIC_NUM);
+
+ write_lock_irqsave(&context_list_lock_lha1, flags);
+ list_add_tail(&ctxt->list, &ipc_log_context_list);
+ write_unlock_irqrestore(&context_list_lock_lha1, flags);
+ return (void *)ctxt;
+
+release_ipc_log_context:
+ while (page_cnt-- > 0) {
+ pg = get_first_page(ctxt);
+ list_del(&pg->hdr.list);
+ kfree(pg);
+ }
+ kfree(ctxt);
+	return NULL;
+}
+EXPORT_SYMBOL(ipc_log_context_create);
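+
+/*
+ * Example lifecycle (illustrative sketch): a driver typically creates
+ * a context once at probe time, logs during operation, and tears the
+ * context down on removal. The name and values are hypothetical:
+ *
+ *	void *log = ipc_log_context_create(4, "my_driver", 0);
+ *
+ *	if (log)
+ *		ipc_log_string(log, "request %d done in %u us", id, us);
+ *	...
+ *	ipc_log_context_destroy(log);
+ */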
+
+/*
+ * Destroy debug log context
+ *
+ * @ctxt: debug log context created by calling ipc_log_context_create API.
+ */
+int ipc_log_context_destroy(void *ctxt)
+{
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ struct ipc_log_page *pg = NULL;
+ unsigned long flags;
+
+ if (!ilctxt)
+ return 0;
+
+ while (!list_empty(&ilctxt->page_list)) {
+ pg = get_first_page(ctxt);
+ list_del(&pg->hdr.list);
+ kfree(pg);
+ }
+
+ write_lock_irqsave(&context_list_lock_lha1, flags);
+ list_del(&ilctxt->list);
+ write_unlock_irqrestore(&context_list_lock_lha1, flags);
+
+ debugfs_remove_recursive(ilctxt->dent);
+
+ kfree(ilctxt);
+ return 0;
+}
+EXPORT_SYMBOL(ipc_log_context_destroy);
+
+static int __init ipc_logging_init(void)
+{
+ check_and_create_debugfs();
+ return 0;
+}
+
+module_init(ipc_logging_init);
+
+MODULE_DESCRIPTION("ipc logging");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/ipc_logging_debug.c b/kernel/trace/ipc_logging_debug.c
new file mode 100644
index 000000000000..a54538798f2b
--- /dev/null
+++ b/kernel/trace/ipc_logging_debug.c
@@ -0,0 +1,184 @@
+/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/ipc_logging.h>
+
+#include "ipc_logging_private.h"
+
+static DEFINE_MUTEX(ipc_log_debugfs_init_lock);
+static struct dentry *root_dent;
+
+static int debug_log(struct ipc_log_context *ilctxt,
+ char *buff, int size, int cont)
+{
+ int i = 0;
+ int ret;
+
+ if (size < MAX_MSG_DECODED_SIZE) {
+ pr_err("%s: buffer size %d < %d\n", __func__, size,
+ MAX_MSG_DECODED_SIZE);
+ return -ENOMEM;
+ }
+ do {
+ i = ipc_log_extract(ilctxt, buff, size - 1);
+ if (cont && i == 0) {
+ ret = wait_for_completion_interruptible(
+ &ilctxt->read_avail);
+ if (ret < 0)
+ return ret;
+ }
+ } while (cont && i == 0);
+
+ return i;
+}
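+
+/*
+ * In continuous mode the loop above sleeps on ilctxt::read_avail, so a
+ * userspace reader of the "log_cont" debugfs file blocks until new
+ * messages are committed instead of seeing end-of-file.
+ */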
+
+/*
+ * VFS Read operation helper which dispatches the call to the debugfs
+ * read command stored in file->private_data.
+ *
+ * @file File structure
+ * @buff user buffer
+ * @count size of user buffer
+ * @ppos file position to read from (only a value of 0 is accepted)
+ * @cont 1 = continuous mode (don't return 0 to signal end-of-file)
+ *
+ * @returns ==0 end of file
+ * >0 number of bytes read
+ * <0 error
+ */
+static ssize_t debug_read_helper(struct file *file, char __user *buff,
+ size_t count, loff_t *ppos, int cont)
+{
+ struct ipc_log_context *ilctxt = file->private_data;
+ char *buffer;
+ int bsize;
+
+ buffer = kmalloc(count, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ bsize = debug_log(ilctxt, buffer, count, cont);
+ if (bsize > 0) {
+ if (copy_to_user(buff, buffer, bsize)) {
+ kfree(buffer);
+ return -EFAULT;
+ }
+ *ppos += bsize;
+ }
+ kfree(buffer);
+ return bsize;
+}
+
+static ssize_t debug_read(struct file *file, char __user *buff,
+ size_t count, loff_t *ppos)
+{
+ return debug_read_helper(file, buff, count, ppos, 0);
+}
+
+static ssize_t debug_read_cont(struct file *file, char __user *buff,
+ size_t count, loff_t *ppos)
+{
+ return debug_read_helper(file, buff, count, ppos, 1);
+}
+
+static int debug_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static const struct file_operations debug_ops = {
+ .read = debug_read,
+ .open = debug_open,
+};
+
+static const struct file_operations debug_ops_cont = {
+ .read = debug_read_cont,
+ .open = debug_open,
+};
+
+static void debug_create(const char *name, mode_t mode,
+ struct dentry *dent,
+ struct ipc_log_context *ilctxt,
+ const struct file_operations *fops)
+{
+ debugfs_create_file(name, mode, dent, ilctxt, fops);
+}
+
+static void dfunc_string(struct encode_context *ectxt,
+ struct decode_context *dctxt)
+{
+ tsv_timestamp_read(ectxt, dctxt, "");
+ tsv_qtimer_read(ectxt, dctxt, " ");
+ tsv_byte_array_read(ectxt, dctxt, "");
+
+ /* add trailing \n if necessary */
+ if (*(dctxt->buff - 1) != '\n') {
+ if (dctxt->size) {
+ ++dctxt->buff;
+ --dctxt->size;
+ }
+ *(dctxt->buff - 1) = '\n';
+ }
+}
+
+void check_and_create_debugfs(void)
+{
+ mutex_lock(&ipc_log_debugfs_init_lock);
+ if (!root_dent) {
+		root_dent = debugfs_create_dir("ipc_logging", NULL);
+
+ if (IS_ERR(root_dent)) {
+ pr_err("%s: unable to create debugfs %ld\n",
+ __func__, PTR_ERR(root_dent));
+ root_dent = NULL;
+ }
+ }
+ mutex_unlock(&ipc_log_debugfs_init_lock);
+}
+EXPORT_SYMBOL(check_and_create_debugfs);
+
+void create_ctx_debugfs(struct ipc_log_context *ctxt,
+ const char *mod_name)
+{
+ if (!root_dent)
+ check_and_create_debugfs();
+
+ if (root_dent) {
+ ctxt->dent = debugfs_create_dir(mod_name, root_dent);
+ if (!IS_ERR(ctxt->dent)) {
+ debug_create("log", 0444, ctxt->dent,
+ ctxt, &debug_ops);
+ debug_create("log_cont", 0444, ctxt->dent,
+ ctxt, &debug_ops_cont);
+ }
+ }
+ add_deserialization_func((void *)ctxt,
+ TSV_TYPE_STRING, dfunc_string);
+}
+EXPORT_SYMBOL(create_ctx_debugfs);
diff --git a/kernel/trace/ipc_logging_private.h b/kernel/trace/ipc_logging_private.h
new file mode 100644
index 000000000000..3ac950695086
--- /dev/null
+++ b/kernel/trace/ipc_logging_private.h
@@ -0,0 +1,165 @@
+/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef _IPC_LOGGING_PRIVATE_H
+#define _IPC_LOGGING_PRIVATE_H
+
+#include <linux/ipc_logging.h>
+
+#define IPC_LOG_VERSION 0x0003
+#define IPC_LOG_MAX_CONTEXT_NAME_LEN 32
+
+/**
+ * struct ipc_log_page_header - Individual log page header
+ *
+ * @magic: Magic number (used for log extraction)
+ * @nmagic: Inverse of magic number (used for log extraction)
+ * @page_num: Index of page (0.. N - 1) (note top bit is always set)
+ * @read_offset: Read offset in page
+ * @write_offset: Write offset in page (or 0xFFFF if full)
+ * @log_id: ID of logging context that owns this page
+ * @start_time: Scheduler clock for first write time in page
+ * @end_time: Scheduler clock for last write time in page
+ * @ctx_offset: Signed offset from page to the logging context. Used to
+ * optimize ram-dump extraction.
+ *
+ * @list: Linked list of pages that make up a log
+ * @nd_read_offset: Non-destructive read offset used for debugfs
+ *
+ * The first part of the structure defines data that is used to extract the
+ * logs from a memory dump and elements in this section should not be changed
+ * or re-ordered. New local data structures can be added to the end of the
+ * structure since they will be ignored by the extraction tool.
+ */
+struct ipc_log_page_header {
+ uint32_t magic;
+ uint32_t nmagic;
+ uint32_t page_num;
+ uint16_t read_offset;
+ uint16_t write_offset;
+ uint64_t log_id;
+ uint64_t start_time;
+ uint64_t end_time;
+ int64_t ctx_offset;
+
+ /* add local data structures after this point */
+ struct list_head list;
+ uint16_t nd_read_offset;
+};
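+
+/*
+ * A RAM-dump extraction tool can validate a candidate page by checking
+ * the redundant magic pair, e.g. (illustrative):
+ *
+ *	valid = hdr->magic == IPC_LOGGING_MAGIC_NUM &&
+ *		hdr->nmagic == ~IPC_LOGGING_MAGIC_NUM;
+ */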
+
+/**
+ * struct ipc_log_page - Individual log page
+ *
+ * @hdr: Log page header
+ * @data: Log data
+ *
+ * Each log consists of 1 to N log pages. Data size is adjusted to always fit
+ * the structure into a single kernel page.
+ */
+struct ipc_log_page {
+ struct ipc_log_page_header hdr;
+ char data[PAGE_SIZE - sizeof(struct ipc_log_page_header)];
+};
+
+/**
+ * struct ipc_log_context - main logging context
+ *
+ * @magic: Magic number (used for log extraction)
+ * @nmagic: Inverse of magic number (used for log extraction)
+ * @version: IPC Logging version of log format
+ * @user_version: Version number for user-defined messages
+ * @header_size: Size of the log header which is used to determine the offset
+ * of ipc_log_page::data
+ * @log_id: Log ID (assigned when log is created)
+ * @name: Name of the log used to uniquely identify the log during extraction
+ *
+ * @list: List of log contexts (struct ipc_log_context)
+ * @page_list: List of log pages (struct ipc_log_page)
+ * @first_page: First page in list of logging pages
+ * @last_page: Last page in list of logging pages
+ * @write_page: Current write page
+ * @read_page: Current read page (for internal reads)
+ * @nd_read_page: Current debugfs extraction page (non-destructive)
+ *
+ * @write_avail: Number of bytes available to write in all pages
+ * @dent: Debugfs node for run-time log extraction
+ * @dfunc_info_list: List of deserialization functions
+ * @context_lock_lhb1: Lock for entire structure
+ * @read_avail: Completed when new data is added to the log
+ */
+struct ipc_log_context {
+ uint32_t magic;
+ uint32_t nmagic;
+ uint32_t version;
+ uint16_t user_version;
+ uint16_t header_size;
+ uint64_t log_id;
+ char name[IPC_LOG_MAX_CONTEXT_NAME_LEN];
+
+ /* add local data structures after this point */
+ struct list_head list;
+ struct list_head page_list;
+ struct ipc_log_page *first_page;
+ struct ipc_log_page *last_page;
+ struct ipc_log_page *write_page;
+ struct ipc_log_page *read_page;
+ struct ipc_log_page *nd_read_page;
+
+ uint32_t write_avail;
+ struct dentry *dent;
+ struct list_head dfunc_info_list;
+ spinlock_t context_lock_lhb1;
+ struct completion read_avail;
+};
+
+struct dfunc_info {
+ struct list_head list;
+ int type;
+ void (*dfunc) (struct encode_context *, struct decode_context *);
+};
+
+enum {
+ TSV_TYPE_INVALID,
+ TSV_TYPE_TIMESTAMP,
+ TSV_TYPE_POINTER,
+ TSV_TYPE_INT32,
+ TSV_TYPE_BYTE_ARRAY,
+ TSV_TYPE_QTIMER,
+};
+
+enum {
+ OUTPUT_DEBUGFS,
+};
+
+#define IPC_LOG_CONTEXT_MAGIC_NUM 0x25874452
+#define IPC_LOGGING_MAGIC_NUM 0x52784425
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define IS_MSG_TYPE(x) (((x) > TSV_TYPE_MSG_START) && \
+ ((x) < TSV_TYPE_MSG_END))
+#define MAX_MSG_DECODED_SIZE (MAX_MSG_SIZE*4)
+
+#if (defined(CONFIG_DEBUG_FS))
+void check_and_create_debugfs(void);
+
+void create_ctx_debugfs(struct ipc_log_context *ctxt,
+ const char *mod_name);
+#else
+static inline void check_and_create_debugfs(void)
+{
+}
+
+static inline void create_ctx_debugfs(struct ipc_log_context *ctxt,
+				      const char *mod_name)
+{
+}
+#endif
+
+#endif
diff --git a/kernel/trace/msm_rtb.c b/kernel/trace/msm_rtb.c
new file mode 100644
index 000000000000..ba609d5eb07f
--- /dev/null
+++ b/kernel/trace/msm_rtb.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2013-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/atomic.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/io.h>
+#include <asm-generic/sizes.h>
+#include <linux/msm_rtb.h>
+
+#define SENTINEL_BYTE_1 0xFF
+#define SENTINEL_BYTE_2 0xAA
+#define SENTINEL_BYTE_3 0xFF
+
+#define RTB_COMPAT_STR "qcom,msm-rtb"
+
+/* Each log entry is written as:
+ * 1) 3 bytes of sentinel
+ * 2) 1 byte of log type
+ * 3) 4 bytes of index
+ * 4) 8 bytes of caller return address
+ * 5) 8 bytes of extra data from the caller
+ * 6) 8 bytes of timestamp
+ *
+ * Total = 32 bytes.
+ */
+struct msm_rtb_layout {
+ unsigned char sentinel[3];
+ unsigned char log_type;
+ uint32_t idx;
+ uint64_t caller;
+ uint64_t data;
+ uint64_t timestamp;
+} __attribute__ ((__packed__));
+
+
+struct msm_rtb_state {
+ struct msm_rtb_layout *rtb;
+ phys_addr_t phys;
+ int nentries;
+ int size;
+ int enabled;
+ int initialized;
+ uint32_t filter;
+ int step_size;
+};
+
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+DEFINE_PER_CPU(atomic_t, msm_rtb_idx_cpu);
+#else
+static atomic_t msm_rtb_idx;
+#endif
+
+static struct msm_rtb_state msm_rtb = {
+ .filter = 1 << LOGK_LOGBUF,
+ .enabled = 1,
+};
+
+module_param_named(filter, msm_rtb.filter, uint, 0644);
+module_param_named(enable, msm_rtb.enabled, int, 0644);
+
+static int msm_rtb_panic_notifier(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ msm_rtb.enabled = 0;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block msm_rtb_panic_blk = {
+ .notifier_call = msm_rtb_panic_notifier,
+ .priority = INT_MAX,
+};
+
+int notrace msm_rtb_event_should_log(enum logk_event_type log_type)
+{
+ return msm_rtb.initialized && msm_rtb.enabled &&
+ ((1 << (log_type & ~LOGTYPE_NOPC)) & msm_rtb.filter);
+}
+EXPORT_SYMBOL(msm_rtb_event_should_log);
+
+static void msm_rtb_emit_sentinel(struct msm_rtb_layout *start)
+{
+ start->sentinel[0] = SENTINEL_BYTE_1;
+ start->sentinel[1] = SENTINEL_BYTE_2;
+ start->sentinel[2] = SENTINEL_BYTE_3;
+}
+
+static void msm_rtb_write_type(enum logk_event_type log_type,
+ struct msm_rtb_layout *start)
+{
+ start->log_type = (char)log_type;
+}
+
+static void msm_rtb_write_caller(uint64_t caller, struct msm_rtb_layout *start)
+{
+ start->caller = caller;
+}
+
+static void msm_rtb_write_idx(uint32_t idx,
+ struct msm_rtb_layout *start)
+{
+ start->idx = idx;
+}
+
+static void msm_rtb_write_data(uint64_t data, struct msm_rtb_layout *start)
+{
+ start->data = data;
+}
+
+static void msm_rtb_write_timestamp(struct msm_rtb_layout *start)
+{
+ start->timestamp = sched_clock();
+}
+
+static void uncached_logk_pc_idx(enum logk_event_type log_type, uint64_t caller,
+ uint64_t data, int idx)
+{
+ struct msm_rtb_layout *start;
+
+ start = &msm_rtb.rtb[idx & (msm_rtb.nentries - 1)];
+
+ msm_rtb_emit_sentinel(start);
+ msm_rtb_write_type(log_type, start);
+ msm_rtb_write_caller(caller, start);
+ msm_rtb_write_idx(idx, start);
+ msm_rtb_write_data(data, start);
+ msm_rtb_write_timestamp(start);
+ mb();
+
+ return;
+}
+
+static void uncached_logk_timestamp(int idx)
+{
+ unsigned long long timestamp;
+
+ timestamp = sched_clock();
+ uncached_logk_pc_idx(LOGK_TIMESTAMP|LOGTYPE_NOPC,
+ (uint64_t)lower_32_bits(timestamp),
+ (uint64_t)upper_32_bits(timestamp), idx);
+}
+
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+static int msm_rtb_get_idx(void)
+{
+ int cpu, i, offset;
+ atomic_t *index;
+
+ /*
+	 * Ideally we would use get_cpu(), but raw_smp_processor_id() is a
+	 * close enough approximation for our purposes.
+ */
+ cpu = raw_smp_processor_id();
+
+ index = &per_cpu(msm_rtb_idx_cpu, cpu);
+
+ i = atomic_add_return(msm_rtb.step_size, index);
+ i -= msm_rtb.step_size;
+
+ /* Check if index has wrapped around */
+ offset = (i & (msm_rtb.nentries - 1)) -
+ ((i - msm_rtb.step_size) & (msm_rtb.nentries - 1));
+ if (offset < 0) {
+ uncached_logk_timestamp(i);
+ i = atomic_add_return(msm_rtb.step_size, index);
+ i -= msm_rtb.step_size;
+ }
+
+ return i;
+}
+#else
+static int msm_rtb_get_idx(void)
+{
+ int i, offset;
+
+ i = atomic_inc_return(&msm_rtb_idx);
+ i--;
+
+ /* Check if index has wrapped around */
+ offset = (i & (msm_rtb.nentries - 1)) -
+ ((i - 1) & (msm_rtb.nentries - 1));
+ if (offset < 0) {
+ uncached_logk_timestamp(i);
+ i = atomic_inc_return(&msm_rtb_idx);
+ i--;
+ }
+
+ return i;
+}
+#endif
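+
+/*
+ * Worked example of the wrap check above (illustrative): with
+ * nentries = 8 and step_size = 1, advancing from i = 7 to i = 8 gives
+ * offset = (8 & 7) - (7 & 7) = 0 - 7 = -7. The negative offset flags
+ * the wrap, so a timestamp entry is logged before slot 0 is reused.
+ */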
+
+int notrace uncached_logk_pc(enum logk_event_type log_type, void *caller,
+ void *data)
+{
+ int i;
+
+ if (!msm_rtb_event_should_log(log_type))
+ return 0;
+
+ i = msm_rtb_get_idx();
+ uncached_logk_pc_idx(log_type, (uint64_t)((unsigned long) caller),
+ (uint64_t)((unsigned long) data), i);
+
+ return 1;
+}
+EXPORT_SYMBOL(uncached_logk_pc);
+
+noinline int notrace uncached_logk(enum logk_event_type log_type, void *data)
+{
+ return uncached_logk_pc(log_type, __builtin_return_address(0), data);
+}
+EXPORT_SYMBOL(uncached_logk);
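+
+/*
+ * Example (illustrative): callers log one tagged data word per event;
+ * the caller PC is captured automatically. log_idx is hypothetical and
+ * LOGK_LOGBUF is the event type enabled by the default filter above:
+ *
+ *	uncached_logk(LOGK_LOGBUF, (void *)(uintptr_t)log_idx);
+ */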
+
+static int msm_rtb_probe(struct platform_device *pdev)
+{
+ struct msm_rtb_platform_data *d = pdev->dev.platform_data;
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+ unsigned int cpu;
+#endif
+ int ret;
+
+ if (!pdev->dev.of_node) {
+ msm_rtb.size = d->size;
+ } else {
+ u64 size;
+ struct device_node *pnode;
+
+ pnode = of_parse_phandle(pdev->dev.of_node,
+ "linux,contiguous-region", 0);
+ if (pnode != NULL) {
+ const u32 *addr;
+
+ addr = of_get_address(pnode, 0, &size, NULL);
+ if (!addr) {
+ of_node_put(pnode);
+ return -EINVAL;
+ }
+ of_node_put(pnode);
+ } else {
+ ret = of_property_read_u32(pdev->dev.of_node,
+ "qcom,rtb-size",
+ (u32 *)&size);
+ if (ret < 0)
+ return ret;
+ }
+
+ msm_rtb.size = size;
+ }
+
+ if (msm_rtb.size <= 0 || msm_rtb.size > SZ_1M)
+ return -EINVAL;
+
+ msm_rtb.rtb = dma_alloc_coherent(&pdev->dev, msm_rtb.size,
+ &msm_rtb.phys,
+ GFP_KERNEL);
+
+ if (!msm_rtb.rtb)
+ return -ENOMEM;
+
+ msm_rtb.nentries = msm_rtb.size / sizeof(struct msm_rtb_layout);
+
+ /* Round this down to a power of 2 */
+ msm_rtb.nentries = __rounddown_pow_of_two(msm_rtb.nentries);
+
+ memset(msm_rtb.rtb, 0, msm_rtb.size);
+
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+ for_each_possible_cpu(cpu) {
+ atomic_t *a = &per_cpu(msm_rtb_idx_cpu, cpu);
+ atomic_set(a, cpu);
+ }
+ msm_rtb.step_size = num_possible_cpus();
+#else
+ atomic_set(&msm_rtb_idx, 0);
+ msm_rtb.step_size = 1;
+#endif
+
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &msm_rtb_panic_blk);
+ msm_rtb.initialized = 1;
+ return 0;
+}
+
+static struct of_device_id msm_match_table[] = {
+ {.compatible = RTB_COMPAT_STR},
+ {},
+};
+
+static struct platform_driver msm_rtb_driver = {
+ .driver = {
+ .name = "msm_rtb",
+ .owner = THIS_MODULE,
+ .of_match_table = msm_match_table
+ },
+};
+
+static int __init msm_rtb_init(void)
+{
+ return platform_driver_probe(&msm_rtb_driver, msm_rtb_probe);
+}
+
+static void __exit msm_rtb_exit(void)
+{
+ platform_driver_unregister(&msm_rtb_driver);
+}
+module_init(msm_rtb_init)
+module_exit(msm_rtb_exit)
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index eb4220a132ec..9270e1ac6460 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,4 +15,5 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
-
+EXPORT_TRACEPOINT_SYMBOL(core_ctl_set_busy);
+EXPORT_TRACEPOINT_SYMBOL(core_ctl_eval_need);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8305cbb2d5a2..ae68222c5a74 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,6 +41,7 @@
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/sched/rt.h>
+#include <linux/coresight-stm.h>
#include "trace.h"
#include "trace_output.h"
@@ -573,8 +574,11 @@ int __trace_puts(unsigned long ip, const char *str, int size)
if (entry->buf[size - 1] != '\n') {
entry->buf[size] = '\n';
entry->buf[size + 1] = '\0';
- } else
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 2);
+ } else {
entry->buf[size] = '\0';
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 1);
+ }
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
@@ -615,6 +619,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->str = str;
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->str, strlen(entry->str)+1);
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
@@ -1352,6 +1357,7 @@ void tracing_reset_all_online_cpus(void)
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
+static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT];
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -1590,7 +1596,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
}
set_cmdline(idx, tsk->comm);
-
+ saved_tgids[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
return 1;
@@ -1633,6 +1639,25 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+int trace_find_tgid(int pid)
+{
+ unsigned map;
+ int tgid;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ map = savedcmd->map_pid_to_cmdline[pid];
+ if (map != NO_CMDLINE_MAP)
+ tgid = saved_tgids[map];
+ else
+ tgid = -1;
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return tgid;
+}
+
void tracing_record_cmdline(struct task_struct *tsk)
{
if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
@@ -2220,6 +2245,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, len + 1);
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
@@ -2583,6 +2609,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
"# | | | | |\n");
}
+static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | | |\n");
+}
+
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
@@ -2595,6 +2628,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
"# | | | |||| | |\n");
}
+static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# _-----=> irqs-off\n");
+ seq_puts(m, "# / _----=> need-resched\n");
+ seq_puts(m, "# | / _---=> hardirq/softirq\n");
+ seq_puts(m, "# || / _--=> preempt-depth\n");
+ seq_puts(m, "# ||| / delay\n");
+ seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |||| | |\n");
+}
+
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
@@ -2907,9 +2952,15 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_irq_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -4161,6 +4212,50 @@ static void trace_insert_enum_map(struct module *mod,
}
static ssize_t
+tracing_saved_tgids_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *file_buf;
+ char *buf;
+ int len = 0;
+ int pid;
+ int i;
+
+ file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL);
+ if (!file_buf)
+ return -ENOMEM;
+
+ buf = file_buf;
+
+ for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) {
+ int tgid;
+ int r;
+
+ pid = savedcmd->map_cmdline_to_pid[i];
+ if (pid == -1 || pid == NO_CMDLINE_MAP)
+ continue;
+
+ tgid = trace_find_tgid(pid);
+ r = sprintf(buf, "%d %d\n", pid, tgid);
+ buf += r;
+ len += r;
+ }
+
+ len = simple_read_from_buffer(ubuf, cnt, ppos,
+ file_buf, len);
+
+ kfree(file_buf);
+
+ return len;
+}
+
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_tgids_read,
+ .llseek = generic_file_llseek,
+};
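+
+/*
+ * Reading the "saved_tgids" file created below yields one "pid tgid"
+ * pair per line, e.g. (values illustrative):
+ *
+ *	1234 1200
+ *	1235 1200
+ */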
+
+static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -5179,8 +5274,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
entry->buf[cnt + 1] = '\0';
- } else
+ stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 2);
+ } else {
entry->buf[cnt] = '\0';
+ stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 1);
+ }
__buffer_unlock_commit(buffer, event);
@@ -6787,6 +6885,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ tr, &tracing_saved_tgids_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 919d9d07686f..e1265f95457f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -656,6 +656,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
@@ -970,7 +971,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
- BRANCH_FLAGS
+ BRANCH_FLAGS \
+ C(TGID, "print-tgid"),
/*
* By defining C, we can make TRACE_FLAGS a list of bit names
diff --git a/kernel/trace/trace_cpu_freq_switch.c b/kernel/trace/trace_cpu_freq_switch.c
new file mode 100644
index 000000000000..f9dab6c4bb72
--- /dev/null
+++ b/kernel/trace/trace_cpu_freq_switch.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2012, 2016 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hrtimer.h>
+#include <linux/tracefs.h>
+#include <linux/ktime.h>
+#include <trace/events/power.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+struct trans {
+ struct rb_node node;
+ unsigned int cpu;
+ unsigned int start_freq;
+ unsigned int end_freq;
+ unsigned int min_us;
+ unsigned int max_us;
+ ktime_t total_t;
+ unsigned int count;
+};
+static struct rb_root freq_trans_tree = RB_ROOT;
+
+static struct trans *tr_search(struct rb_root *root, unsigned int cpu,
+ unsigned int start_freq, unsigned int end_freq)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct trans *tr = container_of(node, struct trans, node);
+
+ if (cpu < tr->cpu)
+ node = node->rb_left;
+ else if (cpu > tr->cpu)
+ node = node->rb_right;
+ else if (start_freq < tr->start_freq)
+ node = node->rb_left;
+ else if (start_freq > tr->start_freq)
+ node = node->rb_right;
+ else if (end_freq < tr->end_freq)
+ node = node->rb_left;
+ else if (end_freq > tr->end_freq)
+ node = node->rb_right;
+ else
+ return tr;
+ }
+ return NULL;
+}
+
+static int tr_insert(struct rb_root *root, struct trans *tr)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ while (*new) {
+ struct trans *this = container_of(*new, struct trans, node);
+
+ parent = *new;
+ if (tr->cpu < this->cpu)
+ new = &((*new)->rb_left);
+ else if (tr->cpu > this->cpu)
+ new = &((*new)->rb_right);
+ else if (tr->start_freq < this->start_freq)
+ new = &((*new)->rb_left);
+ else if (tr->start_freq > this->start_freq)
+ new = &((*new)->rb_right);
+ else if (tr->end_freq < this->end_freq)
+ new = &((*new)->rb_left);
+ else if (tr->end_freq > this->end_freq)
+ new = &((*new)->rb_right);
+ else
+ return -EINVAL;
+ }
+
+ rb_link_node(&tr->node, parent, new);
+ rb_insert_color(&tr->node, root);
+
+ return 0;
+}
+
+struct trans_state {
+ spinlock_t lock;
+ unsigned int start_freq;
+ unsigned int end_freq;
+ ktime_t start_t;
+ bool started;
+};
+static DEFINE_PER_CPU(struct trans_state, freq_trans_state);
+
+static DEFINE_SPINLOCK(state_lock);
+
+static void probe_start(void *ignore, unsigned int start_freq,
+ unsigned int end_freq, unsigned int cpu)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ per_cpu(freq_trans_state, cpu).start_freq = start_freq;
+ per_cpu(freq_trans_state, cpu).end_freq = end_freq;
+ per_cpu(freq_trans_state, cpu).start_t = ktime_get();
+ per_cpu(freq_trans_state, cpu).started = true;
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void probe_end(void *ignore, unsigned int cpu)
+{
+ unsigned long flags;
+ struct trans *tr;
+ s64 dur_us;
+ ktime_t dur_t, end_t = ktime_get();
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (!per_cpu(freq_trans_state, cpu).started)
+ goto out;
+
+ dur_t = ktime_sub(end_t, per_cpu(freq_trans_state, cpu).start_t);
+ dur_us = ktime_to_us(dur_t);
+
+ tr = tr_search(&freq_trans_tree, cpu,
+ per_cpu(freq_trans_state, cpu).start_freq,
+ per_cpu(freq_trans_state, cpu).end_freq);
+ if (!tr) {
+ tr = kzalloc(sizeof(*tr), GFP_ATOMIC);
+ if (!tr) {
+ WARN_ONCE(1, "CPU frequency trace is now invalid!\n");
+ goto out;
+ }
+
+ tr->start_freq = per_cpu(freq_trans_state, cpu).start_freq;
+ tr->end_freq = per_cpu(freq_trans_state, cpu).end_freq;
+ tr->cpu = cpu;
+ tr->min_us = UINT_MAX;
+ tr_insert(&freq_trans_tree, tr);
+ }
+ tr->total_t = ktime_add(tr->total_t, dur_t);
+ tr->count++;
+
+ if (dur_us > tr->max_us)
+ tr->max_us = dur_us;
+ if (dur_us < tr->min_us)
+ tr->min_us = dur_us;
+
+ per_cpu(freq_trans_state, cpu).started = false;
+out:
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void *freq_switch_stat_start(struct tracer_stat *trace)
+{
+ struct rb_node *n;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ n = rb_first(&freq_trans_tree);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return n;
+}
+
+static void *freq_switch_stat_next(void *prev, int idx)
+{
+ struct rb_node *n;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ n = rb_next(prev);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return n;
+}
+
+static int freq_switch_stat_show(struct seq_file *s, void *p)
+{
+ unsigned long flags;
+ struct trans *tr = p;
+
+ spin_lock_irqsave(&state_lock, flags);
+ seq_printf(s, "%3d %9d %8d %5d %6lld %6d %6d\n", tr->cpu,
+ tr->start_freq, tr->end_freq, tr->count,
+ div_s64(ktime_to_us(tr->total_t), tr->count),
+ tr->min_us, tr->max_us);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return 0;
+}
+
+static void freq_switch_stat_release(void *stat)
+{
+ struct trans *tr = stat;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ rb_erase(&tr->node, &freq_trans_tree);
+ spin_unlock_irqrestore(&state_lock, flags);
+ kfree(tr);
+}
+
+static int freq_switch_stat_headers(struct seq_file *s)
+{
+ seq_puts(s, "CPU START_KHZ END_KHZ COUNT AVG_US MIN_US MAX_US\n");
+ seq_puts(s, " | | | | | | |\n");
+ return 0;
+}
+
+struct tracer_stat freq_switch_stats __read_mostly = {
+ .name = "cpu_freq_switch",
+ .stat_start = freq_switch_stat_start,
+ .stat_next = freq_switch_stat_next,
+ .stat_show = freq_switch_stat_show,
+ .stat_release = freq_switch_stat_release,
+ .stat_headers = freq_switch_stat_headers
+};
+
+static void trace_freq_switch_disable(void)
+{
+ unregister_stat_tracer(&freq_switch_stats);
+ unregister_trace_cpu_frequency_switch_end(probe_end, NULL);
+ unregister_trace_cpu_frequency_switch_start(probe_start, NULL);
+ pr_info("disabled cpu frequency switch time profiling\n");
+}
+
+static int trace_freq_switch_enable(void)
+{
+ int ret;
+
+ ret = register_trace_cpu_frequency_switch_start(probe_start, NULL);
+ if (ret)
+ goto out;
+
+ ret = register_trace_cpu_frequency_switch_end(probe_end, NULL);
+ if (ret)
+ goto err_register_switch_end;
+
+ ret = register_stat_tracer(&freq_switch_stats);
+ if (ret)
+ goto err_register_stat_tracer;
+
+ pr_info("enabled cpu frequency switch time profiling\n");
+ return 0;
+
+err_register_stat_tracer:
+ unregister_trace_cpu_frequency_switch_end(probe_end, NULL);
+err_register_switch_end:
+	unregister_trace_cpu_frequency_switch_start(probe_start, NULL);
+out:
+ pr_err("failed to enable cpu frequency switch time profiling\n");
+
+ return ret;
+}
+
+static DEFINE_MUTEX(debugfs_lock);
+static bool trace_freq_switch_enabled;
+
+static int debug_toggle_tracing(void *data, u64 val)
+{
+ int ret = 0;
+
+ mutex_lock(&debugfs_lock);
+
+ if (val == 1 && !trace_freq_switch_enabled)
+ ret = trace_freq_switch_enable();
+ else if (val == 0 && trace_freq_switch_enabled)
+ trace_freq_switch_disable();
+ else if (val > 1)
+ ret = -EINVAL;
+
+ if (!ret)
+ trace_freq_switch_enabled = val;
+
+ mutex_unlock(&debugfs_lock);
+
+ return ret;
+}
+
+static int debug_tracing_state_get(void *data, u64 *val)
+{
+ mutex_lock(&debugfs_lock);
+ *val = trace_freq_switch_enabled;
+ mutex_unlock(&debugfs_lock);
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(debug_tracing_state_fops, debug_tracing_state_get,
+ debug_toggle_tracing, "%llu\n");
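+
+/*
+ * Usage (illustrative): profiling is toggled through the tracefs file
+ * created below and the results are read from the stat tracer, e.g.:
+ *
+ *	echo 1 > /sys/kernel/debug/tracing/cpu_freq_switch_profile_enabled
+ *	cat /sys/kernel/debug/tracing/trace_stat/cpu_freq_switch
+ */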
+
+static int __init trace_freq_switch_init(void)
+{
+ struct dentry *d_tracer = tracing_init_dentry();
+
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ tracefs_create_file("cpu_freq_switch_profile_enabled",
+ S_IRUGO | S_IWUSR, d_tracer, NULL, &debug_tracing_state_fops);
+
+ return 0;
+}
+late_initcall(trace_freq_switch_init);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9319be..731f6484b811 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -256,7 +256,8 @@ int perf_trace_add(struct perf_event *p_event, int flags)
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
+ if (!hlist_unhashed(&p_event->hlist_entry))
+ hlist_del_rcu(&p_event->hlist_entry);
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d202d991edae..fda3b6e1b3a0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -287,14 +287,15 @@ static void output_printk(struct trace_event_buffer *fbuffer)
spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer,
+ unsigned long len)
{
if (tracepoint_printk)
output_printk(fbuffer);
event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->flags, fbuffer->pc, len);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a663cbb84107..4641bdb40f8f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -64,6 +64,9 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
+/* Flag options */
+#define TRACE_GRAPH_PRINT_FLAT 0x80
+
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -87,6 +90,8 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
/* Include time within nested functions */
{ TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+ /* Use standard trace formatting rather than hierarchical */
+ { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) },
{ } /* Empty entry */
};
@@ -1165,6 +1170,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
int cpu = iter->cpu;
int ret;
+ if (flags & TRACE_GRAPH_PRINT_FLAT)
+ return TRACE_TYPE_UNHANDLED;
+
if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
return TRACE_TYPE_HANDLED;
@@ -1222,13 +1230,6 @@ print_graph_function(struct trace_iterator *iter)
return print_graph_function_flags(iter, tracer_flags.val);
}
-static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags,
- struct trace_event *event)
-{
- return print_graph_function(iter);
-}
-
static void print_lat_header(struct seq_file *s, u32 flags)
{
static const char spaces[] = " " /* 16 spaces */
@@ -1297,6 +1298,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
+ if (flags & TRACE_GRAPH_PRINT_FLAT) {
+ trace_default_header(s);
+ return;
+ }
+
if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -1378,19 +1384,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
return 0;
}
-static struct trace_event_functions graph_functions = {
- .trace = print_graph_function_event,
-};
-
-static struct trace_event graph_trace_entry_event = {
- .type = TRACE_GRAPH_ENT,
- .funcs = &graph_functions,
-};
-
-static struct trace_event graph_trace_ret_event = {
- .type = TRACE_GRAPH_RET,
- .funcs = &graph_functions
-};
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
@@ -1467,16 +1460,6 @@ static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_trace_event(&graph_trace_entry_event)) {
- pr_warning("Warning: could not register graph trace events\n");
- return 1;
- }
-
- if (!register_trace_event(&graph_trace_ret_event)) {
- pr_warning("Warning: could not register graph trace events\n");
- return 1;
- }
-
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 282982195e09..3bc4b6de0f4d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -526,11 +526,21 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
+ int tgid;
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d [%03d] ",
- comm, entry->pid, iter->cpu);
+ trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+
+ if (tr->trace_flags & TRACE_ITER_TGID) {
+ tgid = trace_find_tgid(entry->pid);
+ if (tgid < 0)
+ trace_seq_puts(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
+ trace_seq_printf(s, "[%03d] ", iter->cpu);
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
@@ -845,6 +855,174 @@ static struct trace_event trace_fn_event = {
.funcs = &trace_fn_funcs,
};
+/* TRACE_GRAPH_ENT */
+static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_puts(s, "graph_ent: func=");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->graph_ent.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %d\n",
+ field->graph_ent.func,
+ field->graph_ent.depth);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ent_funcs = {
+ .trace = trace_graph_ent_trace,
+ .raw = trace_graph_ent_raw,
+ .hex = trace_graph_ent_hex,
+ .binary = trace_graph_ent_bin,
+};
+
+static struct trace_event trace_graph_ent_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &trace_graph_ent_funcs,
+};
+
+/* TRACE_GRAPH_RET */
+static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_puts(s, "graph_ret: func=");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->ret.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n",
+ field->ret.func,
+ field->ret.calltime,
+ field->ret.rettime,
+ field->ret.overrun,
+ field->ret.depth);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->ret.func);
+ SEQ_PUT_HEX_FIELD(s, field->ret.calltime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.rettime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.overrun);
+ SEQ_PUT_HEX_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->ret.func);
+ SEQ_PUT_FIELD(s, field->ret.calltime);
+ SEQ_PUT_FIELD(s, field->ret.rettime);
+ SEQ_PUT_FIELD(s, field->ret.overrun);
+ SEQ_PUT_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ret_funcs = {
+ .trace = trace_graph_ret_trace,
+ .raw = trace_graph_ret_raw,
+ .hex = trace_graph_ret_hex,
+ .binary = trace_graph_ret_bin,
+};
+
+static struct trace_event trace_graph_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &trace_graph_ret_funcs,
+};
+
/* TRACE_CTX and TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -1222,6 +1400,8 @@ static struct trace_event trace_print_event = {
static struct trace_event *events[] __initdata = {
&trace_fn_event,
+ &trace_graph_ent_event,
+ &trace_graph_ret_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
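The events[] table is consumed once at early init; condensed from the pre-existing init_events() at the bottom of this file, so the two new graph entries need no further wiring:

	__init static int init_events(void)
	{
		struct trace_event *event;
		int i, ret;

		for (i = 0; events[i]; i++) {
			event = events[i];
			ret = register_trace_event(event);
			if (!ret)
				printk(KERN_WARNING "event %d failed to register\n",
				       event->type);
		}
		return 0;
	}
	early_initcall(init_events);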
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9d4399b553a3..78f04e4ad829 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -359,7 +359,8 @@ static bool report_latency(struct trace_array *tr, cycle_t delta)
}
static void
-probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu,
+ unsigned int load)
{
if (task != wakeup_task)
return;
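The sched_migrate_task tracepoint is presumably widened to carry a load argument elsewhere in this series (its TRACE_EVENT definition is not part of this section), so every attached probe must grow the same parameter even where, as here, it goes unused. The probe stays attached the usual way, as in this file's start_wakeup_tracer():

	/* the signature must match the widened tracepoint exactly */
	ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);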
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0655afbea83f..a01740a98afa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -336,7 +336,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ irq_flags, pc, 0);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -382,7 +382,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->ret = syscall_get_return_value(current, regs);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ irq_flags, pc, 0);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index d2f6d0be3503..23515a716748 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -821,7 +821,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0, 0);
}
/* uprobe handler */
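The same pattern runs through these three commit paths: event_trigger_unlock_commit() acquires an extra trailing parameter elsewhere in this series (the header change is outside this section), and each existing call site passes 0 to preserve its previous behaviour, just as the migrate probe above carries an unused load argument.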
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 198137b1cadc..9472691c1eb0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -20,6 +20,7 @@
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <linux/tick.h>
+#include <linux/workqueue.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -103,6 +104,11 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn;
@@ -114,7 +120,9 @@ static unsigned long soft_lockup_nmi_warn;
#ifdef CONFIG_HARDLOCKUP_DETECTOR
unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static unsigned long hardlockup_allcpu_dumped;
+#endif
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -225,7 +233,15 @@ static void __touch_watchdog(void)
__this_cpu_write(watchdog_touch_ts, get_timestamp());
}
-void touch_softlockup_watchdog(void)
+/**
+ * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
+ *
+ * Call when the scheduler may have stalled for legitimate reasons
+ * preventing the watchdog task from executing - e.g. the scheduler
+ * entering idle state. This should only be used for scheduler events.
+ * Use touch_softlockup_watchdog() for everything else.
+ */
+void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
@@ -233,6 +249,12 @@ void touch_softlockup_watchdog(void)
*/
raw_cpu_write(watchdog_touch_ts, 0);
}
+
+void touch_softlockup_watchdog(void)
+{
+ touch_softlockup_watchdog_sched();
+ wq_watchdog_touch(raw_smp_processor_id());
+}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
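With this split, touch_softlockup_watchdog() pets both the softlockup detector and, via wq_watchdog_touch(), this CPU's workqueue watchdog, so existing callers silence both with one call. An illustrative (hypothetical) caller in a legitimately CPU-hogging loop:

	while (!done) {
		do_expensive_chunk();		/* hypothetical long-running work */
		touch_softlockup_watchdog();	/* pets softlockup + wq watchdog */
		cond_resched();
	}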
@@ -246,6 +268,7 @@ void touch_all_softlockup_watchdogs(void)
*/
for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
+ wq_watchdog_touch(-1);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -271,7 +294,7 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/* watchdog detector functions */
static bool is_hardlockup(void)
{
@@ -285,6 +308,76 @@ static bool is_hardlockup(void)
}
#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return 1;
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+static void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
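The online CPUs therefore watch each other in a ring: with CPUs 0-3 set in watchdog_cpus, 0 checks 1, 1 checks 2, 2 checks 3, and 3 wraps back to 0; with only one CPU in the mask, watchdog_next_cpu() returns nr_cpu_ids and the check is skipped entirely. Putting numbers on the sampling comment above, with the default watchdog_thresh of 10s the hrtimer fires every 10 * 2 / 5 = 4s, and testing every third sample flags a stuck CPU after at most 3 * 4 = 12s, i.e. 20% over the nominal threshold.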
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
@@ -297,7 +390,7 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
@@ -360,7 +453,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
static void watchdog_interrupt_count(void)
{
@@ -384,6 +477,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ /* test for hardlockups on the next cpu */
+ watchdog_check_hardlockup_other_cpu();
+
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
@@ -561,7 +657,7 @@ static void watchdog(unsigned int cpu)
watchdog_nmi_disable(cpu);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/*
* People like the simple clean cpu node info on boot.
* Reduce the watchdog noise by only printing messages
@@ -660,9 +756,44 @@ static void watchdog_nmi_disable(unsigned int cpu)
}
#else
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+static void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
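The smp_wmb() in both hotplug hooks pairs with the smp_rmb() in watchdog_check_hardlockup_other_cpu(): a checker that observes the cpumask update is guaranteed to also observe the touch flag written before it, so the affected CPU is skipped for a cycle rather than flagged. The pairing, side by side:

	/* writer (hotplug hook)            reader (checking CPU)
	 *   watchdog_nmi_touch = true        cpus = watchdog_cpus;
	 *   smp_wmb();                       smp_rmb();
	 *   update watchdog_cpus             test watchdog_nmi_touch
	 */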
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 450c21fd0e6e..ef84d9874d03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/bug.h>
#include "workqueue_internal.h"
@@ -148,6 +149,8 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
+ unsigned long watchdog_ts; /* L: watchdog timestamp */
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -1093,6 +1096,8 @@ static void pwq_activate_delayed_work(struct work_struct *work)
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
+ if (list_empty(&pwq->pool->worklist))
+ pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++;
@@ -1395,6 +1400,8 @@ retry:
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
+ if (list_empty(worklist))
+ pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
@@ -2052,6 +2059,7 @@ __acquires(&pool->lock)
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
+ BUG_ON(PANIC_CORRUPTION);
dump_stack();
}
@@ -2167,6 +2175,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
+ pool->watchdog_ts = jiffies;
+
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
@@ -2250,6 +2260,7 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
+ bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2266,9 +2277,14 @@ repeat:
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_pwq(work) == pwq)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ if (first)
+ pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ }
+ first = false;
+ }
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
@@ -3079,6 +3095,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
+ pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -4318,7 +4335,9 @@ void show_workqueue_state(void)
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" workers=%d", pool->nr_workers);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
@@ -5177,6 +5196,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely. Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
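Concretely, the threshold is set at boot with workqueue.watchdog_thresh=<seconds> on the kernel command line, or at runtime by writing to /sys/module/workqueue/parameters/watchdog_thresh (created by the module_param_cb() below with mode 0644); writing 0 disables the watchdog.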
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+ TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+ int cpu;
+
+ wq_watchdog_touched = jiffies;
+ for_each_possible_cpu(cpu)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ bool lockup_detected = false;
+ struct worker_pool *pool;
+ int pi;
+
+ if (!thresh)
+ return;
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ unsigned long pool_ts, touched, ts;
+
+ if (list_empty(&pool->worklist))
+ continue;
+
+ /* get the latest of pool and touched timestamps */
+ pool_ts = READ_ONCE(pool->watchdog_ts);
+ touched = READ_ONCE(wq_watchdog_touched);
+
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+
+ if (pool->cpu >= 0) {
+ unsigned long cpu_touched =
+ READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+ pool->cpu));
+ if (time_after(cpu_touched, ts))
+ ts = cpu_touched;
+ }
+
+ /* did we stall? */
+ if (time_after(jiffies, ts + thresh)) {
+ lockup_detected = true;
+ pr_emerg("BUG: workqueue lockup - pool");
+ pr_cont_pool_info(pool);
+ pr_cont(" stuck for %us!\n",
+ jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (lockup_detected)
+ show_workqueue_state();
+
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+ if (cpu >= 0)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ else
+ wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+ wq_watchdog_thresh = 0;
+ del_timer_sync(&wq_watchdog_timer);
+
+ if (thresh) {
+ wq_watchdog_thresh = thresh;
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+ }
+}
+
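Clearing wq_watchdog_thresh before del_timer_sync() is deliberate: a callback racing with the update reads thresh as 0 and bails out before dumping state or re-arming with a stale period, and del_timer_sync() then guarantees no callback is still running before the new threshold and a fresh timer are installed.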
+static int wq_watchdog_param_set_thresh(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long thresh;
+ int ret;
+
+ ret = kstrtoul(val, 0, &thresh);
+ if (ret)
+ return ret;
+
+ if (system_wq)
+ wq_watchdog_set_thresh(thresh);
+ else
+ wq_watchdog_thresh = thresh;
+
+ return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+ .set = wq_watchdog_param_set_thresh,
+ .get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+ 0644);
+
+static void wq_watchdog_init(void)
+{
+ wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
@@ -5300,6 +5467,9 @@ static int __init init_workqueues(void)
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+
+ wq_watchdog_init();
+
return 0;
}
early_initcall(init_workqueues);