Diffstat (limited to 'kernel')
46 files changed, 766 insertions, 653 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..34f690b9213a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) @@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..939945a5649c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/file.h> #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 48f45987dc6c..63f0e495f517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1987,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", oldloginuid, loginuid, oldsessionid, sessionid, !rc); @@ -2212,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2237,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2337,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); 
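The audit hunks above switch several records from task_pid_nr() to task_tgid_nr(), so the logged "pid" field identifies the whole process (thread group) instead of whichever thread happened to trigger the event. As a hedged illustration of that distinction (not part of the patch), the following small userspace C program prints the thread-group id and the per-thread id from two threads; it assumes a Linux system where getpid() reports the TGID and syscall(SYS_gettid) reports the per-thread id.

/* Hypothetical demo, not from the patch: every thread in a process
 * shares one TGID (what task_tgid_nr() reports), while each thread has
 * its own TID (what task_pid_nr() reports for that thread).
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	printf("worker: tgid=%d tid=%ld\n", getpid(), syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main:   tgid=%d tid=%ld\n", getpid(), syscall(SYS_gettid));
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}

Built with "cc -pthread", the two lines print the same tgid but different tids, which is why auditing by TGID gives a stable per-process identifier.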
context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2369,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e94c3c189338..b05fc202b548 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -236,6 +237,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } @@ -1649,10 +1653,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1719,15 +1719,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -2010,6 +2001,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root = NULL; @@ -2026,6 +2018,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2033,15 +2036,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2152,9 +2146,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? 
CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2197,6 +2192,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2677,45 +2678,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - if (capable(CAP_SYS_NICE)) - return 0; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } - - return 0; -} - -static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - int i; - int ret; - - for_each_css(css, i, cgrp) { - if (css->ss->allow_attach) { - ret = css->ss->allow_attach(tset); - if (ret) - return ret; - } else { - return -EACCES; - } - } - - return 0; -} - static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2730,24 +2692,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - /* - * if the default permission check fails, give each - * cgroup a chance to extend the permission check - */ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct css_set *cset; - cset = task_css_set(task); - list_add(&cset->mg_node, &tset.src_csets); - ret = cgroup_allow_attach(dst_cgrp, &tset); - list_del(&tset.src_csets); - if (ret) - ret = -EACCES; - } + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; @@ -5447,6 +5394,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 25cfcc804077..2432cc630ffb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -185,10 +185,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { 
cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -616,7 +623,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e3c0f38acbe6..29c7240172d3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2095,21 +2095,18 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task, void *priv) { - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if ((current != task) && !capable(CAP_SYS_ADMIN) && - cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) - return -EACCES; - } + if (task_css_is_root(task, cpuset_cgrp_id)) + return; - return 0; + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; } struct cgroup_subsys cpuset_cgrp_subsys = { @@ -2118,11 +2115,11 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, - .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 424961e5bd80..8d2482d77c04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1554,12 +1554,33 @@ static int __init perf_workqueue_init(void) core_initcall(perf_workqueue_init); -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { diff --git a/kernel/fork.c b/kernel/fork.c index a46ce4505066..7de03658692b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -769,6 +769,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) EXPORT_SYMBOL(get_mm_exe_file); /** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). 
+ */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + +/** * get_task_mm - acquire a reference to the task's mm * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning @@ -884,14 +907,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b12a..9812d9c0d483 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -836,7 +836,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT | + IRQD_AFFINITY_MANAGED); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) @@ -845,6 +846,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); + if (irq_settings_has_affinity_managed_set(desc)) + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..a4775f3451b9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, } EXPORT_SYMBOL_GPL(irq_map_generic_chip); +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} + struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..56afc0be6289 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int 
irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5cb153a8474a..e5c70dcb7f8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) } /** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. + */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + +/** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed * diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4b21779d5163..cd6009006510 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -298,6 +298,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..a24c5b909047 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 320579d89091..f0964f058521 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -17,6 +17,7 @@ enum { _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, + _IRQ_AFFINITY_MANAGED = IRQ_AFFINITY_MANAGED, }; #define IRQ_PER_CPU GOT_YOU_MORON @@ -32,6 +33,7 @@ enum { #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON +#define IRQ_AFFINITY_MANAGED GOT_YOU_MORON static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) @@ -65,6 +67,16 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) return desc->status_use_accessors & _IRQ_NO_BALANCING; } +static inline void irq_settings_set_affinity_managed(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_AFFINITY_MANAGED; +} + +static inline bool irq_settings_has_affinity_managed_set(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_AFFINITY_MANAGED; +} + static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..6030efd4a188 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: 
vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..b17263be9082 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -526,8 +526,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -536,12 +539,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7dd5718836e..3124cebaec31 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -299,12 +299,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..27946975eff0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? 
sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..efe1b3b17c88 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 582b66e882ce..e6eceb0aa496 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -597,7 +597,11 @@ void pm_qos_add_request(struct pm_qos_request *req, case PM_QOS_REQ_AFFINE_IRQ: if (irq_can_set_affinity(req->irq)) { struct irq_desc *desc = irq_to_desc(req->irq); - struct cpumask *mask = desc->irq_data.common->affinity; + struct cpumask *mask; + + if (!desc) + return; + mask = desc->irq_data.common->affinity; /* Get the current affinity */ cpumask_copy(&req->cpus_affine, mask); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..f155c62f1f2c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -765,9 +765,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) */ static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur.node = list_entry(bm->cur.node->list.next, - struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; touch_softlockup_watchdog(); @@ -775,9 +775,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? 
*/ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 12cd989dadf6..160e1006640d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -37,6 +37,14 @@ #define HIBERNATE_SIG "S1SUSPEND" /* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + +/* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. @@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio) if (bio_data_dir(bio) == WRITE) put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_error && !hb->error) hb->error = bio->bi_error; @@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); + clean_pages_on_read = true; printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data) d->unc_len = LZO_UNC_SIZE; d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + atomic_set(&d->stop, 1); wake_up(&d->done); } @@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle, } memset(crc, 0, offsetof(struct crc_data, go)); + clean_pages_on_decompress = true; + /* * Start the decompression threads. */ diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..2cb46d51d715 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include <linux/trace_events.h> #include <linux/suspend.h> +#include <soc/qcom/watchdog.h> + #include "tree.h" #include "rcu.h" @@ -1298,6 +1300,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); +#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE + /* Induce watchdog bite */ + msm_trigger_wdog_bite(); +#endif + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1333,6 +1340,11 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); +#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE + /* Induce non secure watchdog bite to collect context */ + msm_trigger_wdog_bite(); +#endif + /* * Attempt to revive the RCU machinery by forcing a context switch. 
* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..32cbe72bf545 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2275,6 +2275,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c index fcfda385b74a..5bdd51b1e55e 100644 --- a/kernel/sched/boost.c +++ b/kernel/sched/boost.c @@ -156,9 +156,6 @@ void sched_boost_parse_dt(void) struct device_node *sn; const char *boost_policy; - if (!sched_enable_hmp) - return; - sn = of_find_node_by_path("/sched-hmp"); if (!sn) return; @@ -175,9 +172,6 @@ int sched_set_boost(int type) { int ret = 0; - if (!sched_enable_hmp) - return -EINVAL; - mutex_lock(&boost_mutex); if (verify_boost_params(sysctl_sched_boost, type)) @@ -197,9 +191,6 @@ int sched_boost_handler(struct ctl_table *table, int write, unsigned int *data = (unsigned int *)table->data; unsigned int old_val; - if (!sched_enable_hmp) - return -EINVAL; - mutex_lock(&boost_mutex); old_val = *data; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 519aee32e122..312ffdad034a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2044,6 +2044,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. 
+ */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; @@ -2321,6 +2343,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -3025,8 +3051,9 @@ void sched_exec(void) unsigned long flags; int dest_cpu, curr_cpu; - if (sched_enable_hmp) - return; +#ifdef CONFIG_SCHED_HMP + return; +#endif raw_spin_lock_irqsave(&p->pi_lock, flags); curr_cpu = task_cpu(p); @@ -3734,7 +3761,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3762,11 +3789,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3784,7 +3815,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3792,7 +3823,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3807,7 +3838,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4163,6 +4194,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4345,17 +4377,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4365,15 +4394,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). 
*/ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -8215,8 +8243,9 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - if (sched_enable_hmp) - pr_info("HMP scheduling enabled.\n"); +#ifdef CONFIG_SCHED_HMP + pr_info("HMP scheduling enabled.\n"); +#endif BUG_ON(num_possible_cpus() > BITS_PER_LONG); @@ -8362,6 +8391,7 @@ void __init sched_init(void) rq->cluster = &init_cluster; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); rq->old_busy_time = 0; rq->old_estimated_time = 0; rq->old_busy_time_group = 0; @@ -8704,7 +8734,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -8728,7 +8758,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -9520,7 +9550,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c index aac12bfc2ae6..1e3accddd103 100644 --- a/kernel/sched/core_ctl.c +++ b/kernel/sched/core_ctl.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved. +/* Copyright (c) 2014-2017, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -36,7 +36,7 @@ struct cluster_data { cpumask_t cpu_mask; unsigned int need_cpus; unsigned int task_thres; - s64 last_isolate_ts; + s64 need_ts; struct list_head lru; bool pending; spinlock_t pending_lock; @@ -277,9 +277,6 @@ static ssize_t show_global_state(const struct cluster_data *state, char *buf) for_each_possible_cpu(cpu) { c = &per_cpu(cpu_state, cpu); - if (!c->cluster) - continue; - cluster = c->cluster; if (!cluster || !cluster->inited) continue; @@ -301,6 +298,9 @@ static ssize_t show_global_state(const struct cluster_data *state, char *buf) count += snprintf(buf + count, PAGE_SIZE - count, "\tIs busy: %u\n", c->is_busy); count += snprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += snprintf(buf + count, PAGE_SIZE - count, "\tNr running: %u\n", cluster->nrrun); count += snprintf(buf + count, PAGE_SIZE - count, "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); @@ -323,13 +323,14 @@ static ssize_t store_not_preferred(struct cluster_data *state, int ret; ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); - if (ret != 1 && ret != state->num_cpus) + if (ret != state->num_cpus) return -EINVAL; - i = 0; spin_lock_irqsave(&state_lock, flags); - list_for_each_entry(c, &state->lru, sib) - c->not_preferred = val[i++]; + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + } spin_unlock_irqrestore(&state_lock, flags); return count; @@ -340,11 +341,14 @@ static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) struct cpu_data *c; ssize_t count = 0; unsigned long flags; + int i; spin_lock_irqsave(&state_lock, flags); - list_for_each_entry(c, &state->lru, sib) - count += snprintf(buf + count, PAGE_SIZE - count, - "\tCPU:%d %u\n", c->cpu, c->not_preferred); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } spin_unlock_irqrestore(&state_lock, flags); return count; @@ -549,6 +553,7 @@ static bool eval_need(struct cluster_data *cluster) bool need_flag = false; unsigned int active_cpus; unsigned int new_need; + s64 now; if (unlikely(!cluster->inited)) return 0; @@ -573,9 +578,10 @@ static bool eval_need(struct cluster_data *cluster) need_flag = adjustment_possible(cluster, new_need); last_need = cluster->need_cpus; - cluster->need_cpus = new_need; + now = ktime_to_ms(ktime_get()); - if (!need_flag) { + if (new_need == last_need) { + cluster->need_ts = now; spin_unlock_irqrestore(&state_lock, flags); return 0; } @@ -583,12 +589,15 @@ static bool eval_need(struct cluster_data *cluster) if (need_cpus > cluster->active_cpus) { ret = 1; } else if (need_cpus < cluster->active_cpus) { - s64 now = ktime_to_ms(ktime_get()); - s64 elapsed = now - cluster->last_isolate_ts; + s64 elapsed = now - cluster->need_ts; ret = elapsed >= cluster->offline_delay_ms; } + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } trace_core_ctl_eval_need(cluster->first_cpu, last_need, need_cpus, ret && need_flag); spin_unlock_irqrestore(&state_lock, flags); @@ -746,7 +755,6 @@ static void try_to_isolate(struct cluster_data *cluster, unsigned int need) if (!sched_isolate_cpu(c->cpu)) { c->isolated_by_us = true; move_cpu_lru(c); - cluster->last_isolate_ts = 
ktime_to_ms(ktime_get()); } else { pr_debug("Unable to isolate CPU%u\n", c->cpu); } @@ -779,7 +787,6 @@ static void try_to_isolate(struct cluster_data *cluster, unsigned int need) if (!sched_isolate_cpu(c->cpu)) { c->isolated_by_us = true; move_cpu_lru(c); - cluster->last_isolate_ts = ktime_to_ms(ktime_get()); } else { pr_debug("Unable to isolate CPU%u\n", c->cpu); } diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f6f9b9b3a4a8..d751bc2d0d6e 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -289,7 +289,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); goto err; @@ -332,7 +332,7 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); policy->governor_data = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c52655581c4c..cf55fc2663fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2620,7 +2620,6 @@ struct cpu_select_env { u8 need_idle:1; u8 need_waker_cluster:1; u8 sync:1; - u8 ignore_prev_cpu:1; enum sched_boost_policy boost_policy; u8 pack_task:1; int prev_cpu; @@ -2630,6 +2629,7 @@ struct cpu_select_env { u64 cpu_load; u32 sbc_best_flag; u32 sbc_best_cluster_flag; + struct cpumask search_cpus; }; struct cluster_cpu_stats { @@ -2834,11 +2834,14 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats) { struct sched_cluster *next = NULL; int i; + struct cpumask search_cpus; while (!bitmap_empty(env->backup_list, num_clusters)) { next = next_candidate(env->backup_list, 0, num_clusters); __clear_bit(next->id, env->backup_list); - for_each_cpu_and(i, &env->p->cpus_allowed, &next->cpus) { + + cpumask_and(&search_cpus, &env->search_cpus, &next->cpus); + for_each_cpu(i, &search_cpus) { trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), sched_irqload(i), power_cost(i, task_load(env->p) + cpu_cravg_sync(i, env->sync)), 0); @@ -3010,11 +3013,7 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, int i; struct cpumask search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus); - cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); - - if (env->ignore_prev_cpu) - cpumask_clear_cpu(env->prev_cpu, &search_cpus); + cpumask_and(&search_cpus, &env->search_cpus, &c->cpus); env->need_idle = wake_to_idle(env->p) || c->wake_up_idle; @@ -3026,7 +3025,7 @@ static void find_best_cpu_in_cluster(struct sched_cluster *c, power_cost(i, task_load(env->p) + cpu_cravg_sync(i, env->sync)), 0); - if (unlikely(!cpu_active(i)) || skip_cpu(i, env)) + if (skip_cpu(i, env)) continue; update_spare_capacity(stats, env, i, c->capacity, @@ -3081,9 +3080,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) return false; prev_cpu = env->prev_cpu; - if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) || - unlikely(!cpu_active(prev_cpu)) || - cpu_isolated(prev_cpu)) + if (!cpumask_test_cpu(prev_cpu, &env->search_cpus)) return false; if (task->ravg.mark_start - task->last_cpu_selected_ts >= @@ -3116,7 +3113,7 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats 
*stats) spill_threshold_crossed(env, cpu_rq(prev_cpu))) { update_spare_capacity(stats, env, prev_cpu, cluster->capacity, env->cpu_load); - env->ignore_prev_cpu = 1; + cpumask_clear_cpu(prev_cpu, &env->search_cpus); return false; } @@ -3132,23 +3129,17 @@ wake_to_waker_cluster(struct cpu_select_env *env) } static inline bool -bias_to_waker_cpu(struct task_struct *p, int cpu) +bias_to_waker_cpu(struct cpu_select_env *env, int cpu) { return sysctl_sched_prefer_sync_wakee_to_waker && cpu_rq(cpu)->nr_running == 1 && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) && - cpu_active(cpu) && !cpu_isolated(cpu); + cpumask_test_cpu(cpu, &env->search_cpus); } static inline int -cluster_allowed(struct task_struct *p, struct sched_cluster *cluster) +cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster) { - cpumask_t tmp_mask; - - cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask); - cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed); - - return !cpumask_empty(&tmp_mask); + return cpumask_intersects(&env->search_cpus, &cluster->cpus); } /* return cheapest cpu that can fit this task */ @@ -3169,7 +3160,6 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .need_waker_cluster = 0, .sync = sync, .prev_cpu = target, - .ignore_prev_cpu = 0, .rtg = NULL, .sbc_best_flag = 0, .sbc_best_cluster_flag = 0, @@ -3182,6 +3172,9 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); bitmap_zero(env.backup_list, NR_CPUS); + cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask); + cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask); + init_cluster_cpu_stats(&stats); special = env_has_special_flags(&env); @@ -3191,19 +3184,19 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, if (grp && grp->preferred_cluster) { pref_cluster = grp->preferred_cluster; - if (!cluster_allowed(p, pref_cluster)) + if (!cluster_allowed(&env, pref_cluster)) clear_bit(pref_cluster->id, env.candidate_list); else env.rtg = grp; } else if (!special) { cluster = cpu_rq(cpu)->cluster; if (wake_to_waker_cluster(&env)) { - if (bias_to_waker_cpu(p, cpu)) { + if (bias_to_waker_cpu(&env, cpu)) { target = cpu; sbc_flag = SBC_FLAG_WAKER_CLUSTER | SBC_FLAG_WAKER_CPU; goto out; - } else if (cluster_allowed(p, cluster)) { + } else if (cluster_allowed(&env, cluster)) { env.need_waker_cluster = 1; bitmap_zero(env.candidate_list, NR_CPUS); __set_bit(cluster->id, env.candidate_list); @@ -3332,9 +3325,9 @@ void _inc_hmp_sched_stats_fair(struct rq *rq, * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on * efficiency by short-circuiting for_each_sched_entity() loop when - * !sched_enable_hmp || sched_disable_window_stats + * sched_disable_window_stats */ - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; for_each_sched_entity(se) { @@ -3357,7 +3350,7 @@ _dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) struct sched_entity *se = &p->se; /* See comment on efficiency in _inc_hmp_sched_stats_fair */ - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; for_each_sched_entity(se) { @@ -3482,8 +3475,7 @@ static inline int migration_needed(struct task_struct *p, int cpu) int nice; struct related_thread_group *grp; - if (!sched_enable_hmp || p->state != TASK_RUNNING || - p->nr_cpus_allowed == 1) 
+ if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1) return 0; /* No need to migrate task that is about to be throttled */ @@ -3500,8 +3492,15 @@ static inline int migration_needed(struct task_struct *p, int cpu) nice = task_nice(p); rcu_read_lock(); grp = task_related_thread_group(p); + /* + * Don't assume higher capacity means higher power. If the task + * is running on the power efficient CPU, avoid migrating it + * to a lower capacity cluster. + */ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE || - upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity) { + upmigrate_discouraged(p)) && + cpu_capacity(cpu) > min_capacity && + cpu_max_power_cost(cpu) == max_power_cost) { rcu_read_unlock(); return DOWN_MIGRATION; } @@ -6846,17 +6845,19 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if(prefer_idle) { - // Find a target cpu with lowest - // utilization. + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ if (target_util == 0 || target_util < new_util) { target_cpu = i; target_util = new_util; } } else { - // Find a target cpu with highest - // utilization. + /* Find a target cpu with lowest + * utilization. + */ if (target_util == 0 || target_util > new_util) { target_cpu = i; @@ -7024,8 +7025,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sched_enable_hmp) - return select_best_cpu(p, prev_cpu, 0, sync); +#ifdef CONFIG_SCHED_HMP + return select_best_cpu(p, prev_cpu, 0, sync); +#endif if (sd_flag & SD_BALANCE_WAKE) want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && @@ -7940,10 +7942,11 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0; + if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) env->flags |= LBF_IGNORE_BIG_TASKS; - else if (!same_cluster(env->dst_cpu, env->src_cpu)) - env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; redo: while (!list_empty(tasks)) { @@ -8381,7 +8384,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) mcc->cpu = cpu; #ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); - pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); goto skip_unlock; #endif } @@ -9312,8 +9316,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - if (sched_enable_hmp) - return find_busiest_queue_hmp(env, group); +#ifdef CONFIG_SCHED_HMP + return find_busiest_queue_hmp(env, group); +#endif for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; @@ -10119,8 +10124,9 @@ static inline int find_new_ilb(int type) { int ilb; - if (sched_enable_hmp) - return find_new_hmp_ilb(type); +#ifdef CONFIG_SCHED_HMP + return find_new_hmp_ilb(type); +#endif ilb = cpumask_first(nohz.idle_cpus_mask); @@ -10495,8 +10501,9 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) if (likely(!atomic_read(&nohz.nr_cpus))) return 0; - if (sched_enable_hmp) - return _nohz_kick_needed_hmp(rq, cpu, type); +#ifdef CONFIG_SCHED_HMP + return _nohz_kick_needed_hmp(rq, cpu, type); +#endif if (time_before(now, nohz.next_balance)) return 0; diff --git a/kernel/sched/features.h 
b/kernel/sched/features.h index 7cc74e56fde4..c30c48fde7e6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else SCHED_FEAT(ENERGY_AWARE, false) +#endif diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 95125c5518e2..df47c26ab6d2 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -28,8 +28,7 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"}; -const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", - "RQ_TO_RQ", "GROUP_TO_GROUP"}; +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"}; static ktime_t ktime_last; static bool sched_ktime_suspended; @@ -456,6 +455,12 @@ compare_clusters(void *priv, struct list_head *a, struct list_head *b) cluster1 = container_of(a, struct sched_cluster, list); cluster2 = container_of(b, struct sched_cluster, list); + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. + */ ret = cluster1->max_power_cost > cluster2->max_power_cost || (cluster1->max_power_cost == cluster2->max_power_cost && cluster1->max_possible_capacity < @@ -616,19 +621,6 @@ int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) return 0; } -int __init set_sched_enable_hmp(char *str) -{ - int enable_hmp = 0; - - get_option(&str, &enable_hmp); - - sched_enable_hmp = !!enable_hmp; - - return 0; -} - -early_param("sched_enable_hmp", set_sched_enable_hmp); - /* Clear any HMP scheduler related requests pending from or on cpu */ void clear_hmp_request(int cpu) { @@ -726,7 +718,7 @@ __read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); unsigned int __read_mostly sysctl_sched_enable_thread_grouping; -__read_mostly unsigned int sysctl_sched_new_task_windows = 5; +#define SCHED_NEW_TASK_WINDOWS 5 #define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 @@ -870,9 +862,6 @@ unsigned int max_task_load(void) return sched_ravg_window; } -/* Use this knob to turn on or off HMP-aware task placement logic */ -unsigned int __read_mostly sched_enable_hmp; - /* A cpu can no longer accommodate more tasks if: * * rq->nr_running > sysctl_sched_spill_nr_run || @@ -970,8 +959,8 @@ unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; unsigned int __read_mostly sysctl_sched_short_burst; unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC; -static void -_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) +static void _update_up_down_migrate(unsigned int *up_migrate, + unsigned int *down_migrate, bool is_group) { unsigned int delta; @@ -985,7 +974,8 @@ _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) *up_migrate >>= 10; *up_migrate *= NSEC_PER_USEC; - *up_migrate = min(*up_migrate, sched_ravg_window); + if (!is_group) + *up_migrate = min(*up_migrate, sched_ravg_window); *down_migrate /= NSEC_PER_USEC; *down_migrate *= up_down_migrate_scale_factor; @@ -1000,14 +990,14 @@ static void update_up_down_migrate(void) unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); - 
_update_up_down_migrate(&up_migrate, &down_migrate); + _update_up_down_migrate(&up_migrate, &down_migrate, false); sched_upmigrate = up_migrate; sched_downmigrate = down_migrate; up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); - _update_up_down_migrate(&up_migrate, &down_migrate); + _update_up_down_migrate(&up_migrate, &down_migrate, true); sched_group_upmigrate = up_migrate; sched_group_downmigrate = down_migrate; } @@ -1245,7 +1235,7 @@ unlock: void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; if (is_big_task(p)) @@ -1254,7 +1244,7 @@ void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; if (is_big_task(p)) @@ -1323,7 +1313,7 @@ void fixup_nr_big_tasks(struct hmp_sched_stats *stats, u64 new_task_load; u64 old_task_load; - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); @@ -1433,9 +1423,6 @@ int sched_window_update_handler(struct ctl_table *table, int write, unsigned int *data = (unsigned int *)table->data; unsigned int old_val; - if (!sched_enable_hmp) - return -EINVAL; - mutex_lock(&policy_mutex); old_val = *data; @@ -1471,9 +1458,6 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, unsigned int *data = (unsigned int *)table->data; int update_task_count = 0; - if (!sched_enable_hmp) - return 0; - /* * The policy mutex is acquired with cpu_hotplug.lock * held from cpu_up()->cpufreq_governor_interactive()-> @@ -1713,45 +1697,19 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load) return freq; } -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu); - -/* - * Return load from all related group in given cpu. - * Caller must ensure that related_thread_group_lock is held. - */ -static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) -{ - struct related_thread_group *grp; - - for_each_related_thread_group(grp) { - struct group_cpu_time *cpu_time; - - cpu_time = _group_cpu_time(grp, cpu); - *grp_load += cpu_time->prev_runnable_sum; - if (new_grp_load) - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } -} - /* * Return load from all related groups in given frequency domain. - * Caller must ensure that related_thread_group_lock is held. 
*/ static void group_load_in_freq_domain(struct cpumask *cpus, u64 *grp_load, u64 *new_grp_load) { - struct related_thread_group *grp; int j; - for_each_related_thread_group(grp) { - for_each_cpu(j, cpus) { - struct group_cpu_time *cpu_time; + for_each_cpu(j, cpus) { + struct rq *rq = cpu_rq(j); - cpu_time = _group_cpu_time(grp, j); - *grp_load += cpu_time->prev_runnable_sum; - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } + *grp_load += rq->grp_time.prev_runnable_sum; + *new_grp_load += rq->grp_time.nt_prev_runnable_sum; } } @@ -1776,9 +1734,6 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups) int rc = 0; u64 group_load = 0, new_load = 0; - if (!sched_enable_hmp) - return 0; - if (check_pred) { u64 prev = rq->old_busy_time; u64 predicted = rq->hmp_stats.pred_demands_sum; @@ -1796,20 +1751,18 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups) if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) return 0; } else { - read_lock_irqsave(&related_thread_group_lock, flags); /* * Protect from concurrent update of rq->prev_runnable_sum and * group cpu load */ - raw_spin_lock(&rq->lock); + raw_spin_lock_irqsave(&rq->lock, flags); if (check_groups) - _group_load_in_cpu(cpu_of(rq), &group_load, NULL); + group_load = rq->grp_time.prev_runnable_sum; new_load = rq->prev_runnable_sum + group_load; new_load = freq_policy_load(rq, new_load); - raw_spin_unlock(&rq->lock); - read_unlock_irqrestore(&related_thread_group_lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); cur_freq = load_to_freq(rq, rq->old_busy_time); freq_required = load_to_freq(rq, new_load); @@ -1897,7 +1850,7 @@ static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, static inline bool is_new_task(struct task_struct *p) { - return p->ravg.active_windows < sysctl_sched_new_task_windows; + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; } #define INC_STEP 8 @@ -2283,6 +2236,31 @@ static void rollover_task_window(struct task_struct *p, bool full_window) } } +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + u64 grp_curr_sum = rq->grp_time.curr_runnable_sum; + u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + grp_curr_sum = 0; + grp_nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + rq->grp_time.prev_runnable_sum = grp_curr_sum; + rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + rq->grp_time.curr_runnable_sum = 0; + rq->grp_time.nt_curr_runnable_sum = 0; +} + /* * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) */ @@ -2299,8 +2277,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, u64 *prev_runnable_sum = &rq->prev_runnable_sum; u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - int flip_counters = 0; - int prev_sum_reset = 0; bool new_task; struct related_thread_group *grp; int cpu = rq->cpu; @@ -2315,51 +2291,6 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, new_task = is_new_task(p); - grp = p->grp; - if (grp && sched_freq_aggregate) { - /* cpu_time protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu_of(rq)); - - curr_runnable_sum = 
&cpu_time->curr_runnable_sum; - prev_runnable_sum = &cpu_time->prev_runnable_sum; - - nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - - if (cpu_time->window_start != rq->window_start) { - int nr_windows; - - delta = rq->window_start - cpu_time->window_start; - nr_windows = div64_u64(delta, window_size); - if (nr_windows > 1) - prev_sum_reset = 1; - - cpu_time->window_start = rq->window_start; - flip_counters = 1; - } - - if (p_is_curr_task && new_window) { - u64 curr_sum = rq->curr_runnable_sum; - u64 nt_curr_sum = rq->nt_curr_runnable_sum; - - if (full_window) - curr_sum = nt_curr_sum = 0; - - rq->prev_runnable_sum = curr_sum; - rq->nt_prev_runnable_sum = nt_curr_sum; - - rq->curr_runnable_sum = 0; - rq->nt_curr_runnable_sum = 0; - } - } else { - if (p_is_curr_task && new_window) { - flip_counters = 1; - if (full_window) - prev_sum_reset = 1; - } - } - /* * Handle per-task window rollover. We don't care about the idle * task or exiting tasks. @@ -2369,26 +2300,25 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, rollover_task_window(p, full_window); } - if (flip_counters) { - u64 curr_sum = *curr_runnable_sum; - u64 nt_curr_sum = *nt_curr_runnable_sum; + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + rollover_top_tasks(rq, full_window); + } - if (prev_sum_reset) - curr_sum = nt_curr_sum = 0; + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; - *prev_runnable_sum = curr_sum; - *nt_prev_runnable_sum = nt_curr_sum; + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time = &rq->grp_time; - *curr_runnable_sum = 0; - *nt_curr_runnable_sum = 0; + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; - if (p_is_curr_task) - rollover_top_tasks(rq, full_window); + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; } - if (!account_busy_for_cpu_time(rq, p, irqtime, event)) - goto done; - if (!new_window) { /* * account_busy_for_cpu_time() = 1 so busy time needs @@ -2864,7 +2794,7 @@ static u64 update_task_demand(struct task_struct *p, struct rq *rq, } static inline void -update_task_burst(struct task_struct *p, struct rq *rq, int event, int runtime) +update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime) { /* * update_task_demand() has checks for idle task and @@ -2905,7 +2835,7 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, done: trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, rq->cc.cycles, rq->cc.time, - _group_cpu_time(p->grp, cpu_of(rq))); + p->grp ? 
&rq->grp_time : NULL); p->ravg.mark_start = wallclock; } @@ -3012,7 +2942,7 @@ void set_window_start(struct rq *rq) { static int sync_cpu_available; - if (rq->window_start || !sched_enable_hmp) + if (rq->window_start) return; if (!sync_cpu_available) { @@ -3063,7 +2993,6 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) u64 start_ts = sched_ktime_clock(); int reason = WINDOW_CHANGE; unsigned int old = 0, new = 0; - struct related_thread_group *grp; local_irq_save(flags); @@ -3081,19 +3010,6 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) read_unlock(&tasklist_lock); - list_for_each_entry(grp, &active_related_thread_groups, list) { - int j; - - for_each_possible_cpu(j) { - struct group_cpu_time *cpu_time; - /* Protected by rq lock */ - cpu_time = _group_cpu_time(grp, j); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - if (window_start) - cpu_time->window_start = window_start; - } - } - if (window_size) { sched_ravg_window = window_size * TICK_NSEC; set_hmp_defaults(); @@ -3109,6 +3025,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size) rq->window_start = window_start; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { memset(&rq->load_subs[i], 0, sizeof(struct load_subtractions)); @@ -3198,15 +3115,12 @@ static inline u64 freq_policy_load(struct rq *rq, u64 load) case FREQ_REPORT_CPU_LOAD: break; default: - WARN_ON_ONCE(1); + break; } return load; } -static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); - void sched_get_cpus_busy(struct sched_load *busy, const struct cpumask *query_cpus) { @@ -3223,7 +3137,6 @@ void sched_get_cpus_busy(struct sched_load *busy, unsigned int window_size; u64 max_prev_sum = 0; int max_busy_cpu = cpumask_first(query_cpus); - struct related_thread_group *grp; u64 total_group_load = 0, total_ngload = 0; bool aggregate_load = false; struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus)); @@ -3233,8 +3146,6 @@ void sched_get_cpus_busy(struct sched_load *busy, local_irq_save(flags); - read_lock(&related_thread_group_lock); - /* * This function could be called in timer context, and the * current task may have been executing for a long time. 
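[Editor's note] The flip_counters/prev_sum_reset bookkeeping removed in the hunks above is replaced by rollover_cpu_window(), which rolls the per-rq sums and the new rq->grp_time group sums in one place. Below is a minimal userspace model of that rollover, not kernel code: the types are simplified and the nt_* "new task" sums are omitted for brevity.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Simplified stand-ins for the kernel structures touched above. */
struct group_cpu_time {
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
};

struct rq_model {
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
	struct group_cpu_time grp_time;
};

/* On a window boundary, "curr" becomes "prev" (or 0 if more than one
 * full window elapsed) and "curr" restarts from zero, for both the rq
 * sums and the per-rq group sums. */
static void rollover_cpu_window(struct rq_model *rq, bool full_window)
{
	uint64_t curr = full_window ? 0 : rq->curr_runnable_sum;
	uint64_t grp_curr = full_window ? 0 : rq->grp_time.curr_runnable_sum;

	rq->prev_runnable_sum = curr;
	rq->grp_time.prev_runnable_sum = grp_curr;
	rq->curr_runnable_sum = 0;
	rq->grp_time.curr_runnable_sum = 0;
}

int main(void)
{
	struct rq_model rq = { .curr_runnable_sum = 1000 };

	rq.grp_time.curr_runnable_sum = 400;
	rollover_cpu_window(&rq, false);
	printf("prev=%llu grp_prev=%llu\n",
	       (unsigned long long)rq.prev_runnable_sum,
	       (unsigned long long)rq.grp_time.prev_runnable_sum);
	return 0;
}

Because the group counters now live in struct rq, the rollover no longer needs the per-group window_start synchronization that sync_window_start() used to provide.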
Ensure @@ -3287,15 +3198,6 @@ void sched_get_cpus_busy(struct sched_load *busy, raw_spin_unlock(&cluster->load_lock); - for_each_related_thread_group(grp) { - for_each_cpu(cpu, query_cpus) { - /* Protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu); - sync_window_start(cpu_rq(cpu), cpu_time); - } - } - group_load_in_freq_domain( &cpu_rq(max_busy_cpu)->freq_domain_cpumask, &total_group_load, &total_ngload); @@ -3316,7 +3218,8 @@ void sched_get_cpus_busy(struct sched_load *busy, ngload[i] = total_ngload; } } else { - _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); + group_load[i] = rq->grp_time.prev_runnable_sum; + ngload[i] = rq->grp_time.nt_prev_runnable_sum; } load[i] += group_load[i]; @@ -3341,8 +3244,6 @@ skip_early: for_each_cpu(cpu, query_cpus) raw_spin_unlock(&(cpu_rq(cpu))->lock); - read_unlock(&related_thread_group_lock); - local_irq_restore(flags); i = 0; @@ -3373,7 +3274,9 @@ exit_early: trace_sched_get_busy(cpu, busy[i].prev_load, busy[i].new_task_load, busy[i].predicted_load, - early_detection[i]); + early_detection[i], + aggregate_load && + cpu == max_busy_cpu); i++; } } @@ -3620,7 +3523,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) bool new_task; struct related_thread_group *grp; - if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) + if (!p->on_rq && p->state != TASK_WAKING) return; if (exiting_task(p)) { @@ -3659,18 +3562,17 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) if (grp && sched_freq_aggregate) { struct group_cpu_time *cpu_time; - cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); + cpu_time = &src_rq->grp_time; src_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); + cpu_time = &dest_rq->grp_time; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - sync_window_start(dest_rq, cpu_time); if (p->ravg.curr_window) { *src_curr_runnable_sum -= p->ravg.curr_window; @@ -3799,61 +3701,6 @@ void set_preferred_cluster(struct related_thread_group *grp) #define DEFAULT_CGROUP_COLOC_ID 1 -static inline void free_group_cputime(struct related_thread_group *grp) -{ - free_percpu(grp->cpu_time); -} - -static int alloc_group_cputime(struct related_thread_group *grp) -{ - int i; - struct group_cpu_time *cpu_time; - int cpu = raw_smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - u64 window_start = rq->window_start; - - grp->cpu_time = alloc_percpu_gfp(struct group_cpu_time, GFP_ATOMIC); - if (!grp->cpu_time) - return -ENOMEM; - - for_each_possible_cpu(i) { - cpu_time = per_cpu_ptr(grp->cpu_time, i); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - cpu_time->window_start = window_start; - } - - return 0; -} - -/* - * A group's window_start may be behind. When moving it forward, flip prev/curr - * counters. 
When moving forward > 1 window, prev counter is set to 0 - */ -static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) -{ - u64 delta; - int nr_windows; - u64 curr_sum = cpu_time->curr_runnable_sum; - u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; - - delta = rq->window_start - cpu_time->window_start; - if (!delta) - return; - - nr_windows = div64_u64(delta, sched_ravg_window); - if (nr_windows > 1) - curr_sum = nt_curr_sum = 0; - - cpu_time->prev_runnable_sum = curr_sum; - cpu_time->curr_runnable_sum = 0; - - cpu_time->nt_prev_runnable_sum = nt_curr_sum; - cpu_time->nt_curr_runnable_sum = 0; - - cpu_time->window_start = rq->window_start; -} - /* * Task's cpu usage is accounted in: * rq->curr/prev_runnable_sum, when its ->grp is NULL @@ -3871,7 +3718,6 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - struct migration_sum_data d; int migrate_type; int cpu = cpu_of(rq); bool new_task; @@ -3886,15 +3732,10 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); new_task = is_new_task(p); - /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ - cpu_time = _group_cpu_time(grp, cpu); + cpu_time = &rq->grp_time; if (event == ADD_TASK) { - sync_window_start(rq, cpu_time); migrate_type = RQ_TO_GROUP; - d.src_rq = rq; - d.src_cpu_time = NULL; - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; + src_curr_runnable_sum = &rq->curr_runnable_sum; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &rq->prev_runnable_sum; @@ -3919,17 +3760,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, } else { migrate_type = GROUP_TO_RQ; - d.src_rq = NULL; - d.src_cpu_time = cpu_time; - d.dst_rq = rq; - d.dst_cpu_time = NULL; - /* - * In case of REM_TASK, cpu_time->window_start would be - * uptodate, because of the update_task_ravg() we called - * above on the moving task. Hence no need for - * sync_window_start() - */ src_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_curr_runnable_sum = &rq->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; @@ -3975,7 +3806,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; - trace_sched_migration_update_sum(p, migrate_type, &d); + trace_sched_migration_update_sum(p, migrate_type, rq); BUG_ON((s64)*src_curr_runnable_sum < 0); BUG_ON((s64)*src_prev_runnable_sum < 0); @@ -3983,18 +3814,6 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, BUG_ON((s64)*src_nt_prev_runnable_sum < 0); } -static inline struct group_cpu_time * -task_group_cpu_time(struct task_struct *p, int cpu) -{ - return _group_cpu_time(rcu_dereference(p->grp), cpu); -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu) -{ - return grp ? 
per_cpu_ptr(grp->cpu_time, cpu) : NULL; -} - static inline struct related_thread_group* lookup_related_thread_group(unsigned int group_id) { @@ -4014,12 +3833,6 @@ int alloc_related_thread_groups(void) goto err; } - if (alloc_group_cputime(grp)) { - kfree(grp); - ret = -ENOMEM; - goto err; - } - grp->id = i; INIT_LIST_HEAD(&grp->tasks); INIT_LIST_HEAD(&grp->list); @@ -4034,7 +3847,6 @@ err: for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { grp = lookup_related_thread_group(i); if (grp) { - free_group_cputime(grp); kfree(grp); related_thread_groups[i] = NULL; } else { @@ -4418,9 +4230,6 @@ static int register_sched_callback(void) { int ret; - if (!sched_enable_hmp) - return 0; - ret = cpufreq_register_notifier(¬ifier_policy_block, CPUFREQ_POLICY_NOTIFIER); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3fe00d6fa335..07b2c63e4983 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -441,7 +441,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -487,8 +487,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -504,7 +504,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -521,7 +521,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1257,7 +1257,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1270,26 +1293,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. 
*/ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1298,7 +1332,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1311,31 +1345,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1351,7 +1385,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1363,7 +1397,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); @@ -1406,6 +1440,7 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +#ifdef CONFIG_SCHED_HMP static int select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) { @@ -1419,6 +1454,7 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, 
int sd_flag, int flags) return cpu; } +#endif static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) @@ -1426,8 +1462,9 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sched_enable_hmp) - return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); +#ifdef CONFIG_SCHED_HMP + return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); +#endif /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1796,14 +1833,6 @@ static int find_lowest_rq_hmp(struct task_struct *task) return best_cpu; } - -#else /* CONFIG_SCHED_HMP */ - -static int find_lowest_rq_hmp(struct task_struct *task) -{ - return -1; -} - #endif /* CONFIG_SCHED_HMP */ static int find_lowest_rq(struct task_struct *task) @@ -1813,8 +1842,9 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - if (sched_enable_hmp) - return find_lowest_rq_hmp(task); +#ifdef CONFIG_SCHED_HMP + return find_lowest_rq_hmp(task); +#endif /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d907eeb297a3..75500042fd32 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -366,6 +366,13 @@ struct load_subtractions { u64 new_subs; }; +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + struct sched_cluster { raw_spinlock_t load_lock; struct list_head list; @@ -407,12 +414,6 @@ struct related_thread_group { struct sched_cluster *preferred_cluster; struct rcu_head rcu; u64 last_update; - struct group_cpu_time __percpu *cpu_time; /* one per cluster */ -}; - -struct migration_sum_data { - struct rq *src_rq, *dst_rq; - struct group_cpu_time *src_cpu_time, *dst_cpu_time; }; extern struct list_head cluster_head; @@ -776,6 +777,7 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + struct group_cpu_time grp_time; struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; DECLARE_BITMAP_ARRAY(top_tasks_bitmap, NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); @@ -1069,10 +1071,6 @@ enum sched_boost_policy { #define WINDOW_STATS_AVG 3 #define WINDOW_STATS_INVALID_POLICY 4 -#define FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK 0 -#define FREQ_REPORT_CPU_LOAD 1 -#define FREQ_REPORT_TOP_TASK 2 - #define SCHED_UPMIGRATE_MIN_NICE 15 #define EXITING_TASK_MARKER 0xdeaddead @@ -1083,7 +1081,6 @@ enum sched_boost_policy { extern struct mutex policy_mutex; extern unsigned int sched_ravg_window; extern unsigned int sched_disable_window_stats; -extern unsigned int sched_enable_hmp; extern unsigned int max_possible_freq; extern unsigned int min_max_freq; extern unsigned int pct_task_load(struct task_struct *p); @@ -1127,7 +1124,6 @@ extern void update_cluster_topology(void); extern void note_task_waking(struct task_struct *p, u64 wallclock); extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); extern void init_clusters(void); -extern int __init set_sched_enable_hmp(char *str); extern void reset_cpu_hmp_stats(int cpu, int reset_cra); extern unsigned int max_task_load(void); extern void sched_account_irqtime(int cpu, struct task_struct *curr, @@ -1257,7 +1253,7 @@ inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, { u32 task_load; - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; task_load = 
sched_disable_window_stats ? 0 : p->ravg.demand; @@ -1272,7 +1268,7 @@ dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, { u32 task_load; - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; task_load = sched_disable_window_stats ? 0 : p->ravg.demand; @@ -1290,7 +1286,7 @@ fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, struct task_struct *p, s64 task_load_delta, s64 pred_demand_delta) { - if (!sched_enable_hmp || sched_disable_window_stats) + if (sched_disable_window_stats) return; stats->cumulative_runnable_avg += task_load_delta; @@ -1350,14 +1346,6 @@ check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); extern void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, struct task_struct *p); -struct group_cpu_time { - u64 curr_runnable_sum; - u64 prev_runnable_sum; - u64 nt_curr_runnable_sum; - u64 nt_prev_runnable_sum; - u64 window_start; -}; - /* Is frequency of two cpus synchronized with each other? */ static inline int same_freq_domain(int src_cpu, int dst_cpu) { @@ -1667,7 +1655,6 @@ static inline int update_preferred_cluster(struct related_thread_group *grp, static inline void add_new_task_to_grp(struct task_struct *new) {} -#define sched_enable_hmp 0 #define PRED_DEMAND_DELTA (0) static inline void @@ -1954,19 +1941,41 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
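[Editor's note] The move_entity() helper added earlier in kernel/sched/rt.c encodes the rule this comment describes: the entity stays on its priority list only for a SAVE that is not also a MOVE. A tiny userspace check of the combinations, with the flag values copied from the defines added below (illustrative only, not kernel code):

#include <stdio.h>
#include <stdbool.h>

#define DEQUEUE_SLEEP	0x01
#define DEQUEUE_SAVE	0x02	/* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE	0x04	/* matches ENQUEUE_MOVE */

/* Same predicate as the kernel helper: skip the list manipulation
 * only when SAVE is set without MOVE. */
static bool move_entity(unsigned int flags)
{
	return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
	printf("no flags  -> move=%d\n", move_entity(0));
	printf("SAVE      -> move=%d\n", move_entity(DEQUEUE_SAVE));
	printf("SAVE|MOVE -> move=%d\n",
	       move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));
	return 0;
}

Only the SAVE-without-MOVE case leaves rt_se->on_list and the priority bitmap untouched; a plain dequeue or a SAVE|MOVE pair manipulates the list as before.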
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 -#define ENQUEUE_WAKEUP_NEW 0x20 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 +#define ENQUEUE_WAKEUP_NEW 0x40 #define RETRY_TASK ((void *)-1UL) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b2ff383d6062..b0c5fe6d1f3b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -896,7 +896,6 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, - .allow_attach = subsys_cgroup_allow_attach, .attach = schedtune_attach, }; @@ -910,6 +909,7 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); } pr_info("schedtune: configured to support %d boost groups\n", diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 07b7f84b37e2..2ffb1680b380 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,7 +22,6 @@ #include <linux/syscore_ops.h> #include <linux/cpufreq.h> #include <trace/events/sched.h> -#include <clocksource/arm_arch_timer.h> #include "sched.h" #include "walt.h" @@ -188,10 +187,8 @@ update_window_start(struct rq *rq, u64 wallclock) delta = wallclock - rq->window_start; /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ if (delta < 0) { - if (arch_timer_read_counter() == 0) - delta = 0; - else - BUG_ON(1); + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); } if (delta < walt_ravg_window) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d3873333c766..51eef8e7df39 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -133,6 +133,7 @@ static int ten_thousand = 10000; #endif #ifdef CONFIG_SCHED_HMP static int one_thousand = 1000; +static int max_freq_reporting_policy = FREQ_REPORT_INVALID_POLICY - 1; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ @@ -178,7 +179,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -297,6 +298,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, + .extra2 = &max_freq_reporting_policy, }, { .procname = "sched_freq_inc_notify", @@ -447,13 +449,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "sched_new_task_windows", - .data = &sysctl_sched_new_task_windows, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_window_update_handler, - }, - { .procname = "sched_pred_alert_freq", .data = 
&sysctl_sched_pred_alert_freq, .maxlen = sizeof(unsigned int), @@ -591,7 +586,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "sched_shares_window_ns", @@ -2375,6 +2371,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2502,8 +2513,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -3116,6 +3146,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3161,6 +3197,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0cdc34ebd8d1..2af5687b83c9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -412,12 +412,10 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); if (poweron_alarm) { - struct rtc_time tm_val; - unsigned long secs; + uint64_t msec = 0; - tm_val = rtc_ktime_to_tm(min); - rtc_tm_to_time(&tm_val, &secs); - lpm_suspend_wake_time(secs); + msec = ktime_to_ms(min); + lpm_suspend_wake_time(msec); } else { /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b98810d2f3b4..89cc82a38e4d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -108,7 +108,7 @@ static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); -static void clocksource_select(void); +static void clocksource_select(bool force); static LIST_HEAD(watchdog_list); 
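[Editor's note] proc_douintvec() added above pairs the existing integer parsing with do_proc_douintvec_conv(), which refuses negative input on write. A minimal sketch of how a caller might wire up the new handler; the demo_uint sysctl, the table, and the module here are hypothetical and not part of this patch:

#include <linux/module.h>
#include <linux/sysctl.h>

static unsigned int demo_val;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_uint",
		.data		= &demo_val,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,	/* new handler */
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_init(void)
{
	demo_header = register_sysctl("kernel", demo_table);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	unregister_sysctl_table(demo_header);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With that registered, writing a negative value to /proc/sys/kernel/demo_uint fails with EINVAL, whereas proc_dointvec would have accepted and stored it as a signed value.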
static struct clocksource *watchdog; @@ -415,7 +415,7 @@ static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); if (__clocksource_watchdog_kthread()) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -555,11 +555,12 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur, + bool force) { struct clocksource *cs; - if (!finished_booting || list_empty(&clocksource_list)) + if ((!finished_booting && !force) || list_empty(&clocksource_list)) return NULL; /* @@ -577,13 +578,13 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) return NULL; } -static void __clocksource_select(bool skipcur) +static void __clocksource_select(bool skipcur, bool force) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot, skipcur); + best = clocksource_find_best(oneshot, skipcur, force); if (!best) return; @@ -623,22 +624,40 @@ static void __clocksource_select(bool skipcur) * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. */ -static void clocksource_select(void) +static void clocksource_select(bool force) { - __clocksource_select(false); + return __clocksource_select(false, force); } static void clocksource_select_fallback(void) { - __clocksource_select(true); + __clocksource_select(true, false); } #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ -static inline void clocksource_select(void) { } + +static inline void clocksource_select(bool force) { } static inline void clocksource_select_fallback(void) { } #endif +/** + * clocksource_select_force - Force re-selection of the best clocksource + * among registered clocksources + * + * clocksource_select() can't select the best clocksource before + * calling clocksource_done_booting() and since clocksource_select() + * should be called with clocksource_mutex held, provide a new API + * can be called from other files to select best clockrouce irrespective + * of finished_booting flag. 
+ */ +void clocksource_select_force(void) +{ + mutex_lock(&clocksource_mutex); + clocksource_select(true); + mutex_unlock(&clocksource_mutex); +} + /* * clocksource_done_booting - Called near the end of core bootup * @@ -655,7 +674,7 @@ static int __init clocksource_done_booting(void) * Run the watchdog first to eliminate unstable clock sources */ __clocksource_watchdog_kthread(); - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -744,6 +763,7 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); + /** * __clocksource_register_scale - Used to install new clocksources * @cs: clocksource to be registered @@ -765,7 +785,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); - clocksource_select(); + clocksource_select(false); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; @@ -788,7 +808,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); - clocksource_select(); + clocksource_select(false); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } @@ -892,7 +912,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4ff237dbc006..ede4bf13d3e9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** @@ -385,8 +402,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += clocksource_delta(tkr->read(tkr->clock), - tkr->cycle_last, tkr->mask); + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tkr->read(tkr->clock), + tkr->cycle_last, + tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; @@ -404,6 +424,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. 
+ * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2acad4b6a92a..2963266fb7bf 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,4 +1,8 @@ +# We are fully aware of the dangers of __builtin_return_address() +FRAME_CFLAGS := $(call cc-disable-warning,frame-address) +KBUILD_CFLAGS += $(FRAME_CFLAGS) + # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1d0f1a1ac44c..66d9e907aa07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -895,6 +895,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -1357,11 +1358,11 @@ void tracing_reset_all_online_cpus(void) #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT]; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; + unsigned *saved_tgids; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; @@ -1395,12 +1396,22 @@ static int allocate_cmdlines_buffer(unsigned int val, return -ENOMEM; } + s->saved_tgids = kmalloc_array(val, sizeof(*s->saved_tgids), + GFP_KERNEL); + if (!s->saved_tgids) { + kfree(s->map_cmdline_to_pid); + kfree(s->saved_cmdlines); + return -ENOMEM; + } + s->cmdline_idx = 0; s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); + memset(s->saved_tgids, 0, + val * sizeof(*s->saved_tgids)); return 0; } @@ -1596,7 +1607,7 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - saved_tgids[idx] = tsk->tgid; + savedcmd->saved_tgids[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); return 1; @@ -1648,7 +1659,7 @@ int trace_find_tgid(int pid) arch_spin_lock(&trace_cmdline_lock); map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - tgid = saved_tgids[map]; + tgid = savedcmd->saved_tgids[map]; else tgid = -1; @@ -3980,6 +3991,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { kfree(s->saved_cmdlines); 
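[Editor's note] ktime_get_boot_fast_ns() above is simply the monotonic fast clock plus tk->offs_boot, the same relationship userspace sees between CLOCK_MONOTONIC and CLOCK_BOOTTIME, and the patch also exposes it to the tracer as the "boot" trace clock. A purely illustrative userspace comparison of the two POSIX clocks:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* The difference is (roughly) the accumulated suspend time,
	 * i.e. the offs_boot term folded in by ktime_get_boot_fast_ns(). */
	printf("monotonic: %lld.%09ld\n", (long long)mono.tv_sec, mono.tv_nsec);
	printf("boottime : %lld.%09ld\n", (long long)boot.tv_sec, boot.tv_nsec);
	return 0;
}

Selecting "boot" via the tracefs trace_clock file then timestamps events on this clock, so traces taken across suspend line up with boot-relative timestamps.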
kfree(s->map_cmdline_to_pid); + kfree(s->saved_tgids); kfree(s); } @@ -4221,13 +4233,13 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf, int pid; int i; - file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL); + file_buf = kmalloc(savedcmd->cmdline_num*(16+1+16), GFP_KERNEL); if (!file_buf) return -ENOMEM; buf = file_buf; - for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) { + for (i = 0; i < savedcmd->cmdline_num; i++) { int tgid; int r; @@ -4822,19 +4834,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) @@ -5864,9 +5877,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -5876,6 +5886,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -5933,19 +5946,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f2813e137b23..91fa701b4a24 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,6 +27,7 @@ #include <linux/kvm_para.h> #include <linux/perf_event.h> #include <linux/kthread.h> +#include <soc/qcom/watchdog.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -366,8 +367,11 @@ static void watchdog_check_hardlockup_other_cpu(void) if (per_cpu(hard_watchdog_warn, next_cpu) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu); + if (hardlockup_panic) { + pr_err("Watchdog detected hard LOCKUP on cpu %u", + next_cpu); + msm_trigger_wdog_bite(); + } else WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu); @@ -430,6 +434,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + if (hardlockup_panic) + msm_trigger_wdog_bite(); + print_modules(); print_irqtrace_events(current); if (regs) @@ -552,6 +559,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + + if (softlockup_panic) + msm_trigger_wdog_bite(); __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); |
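[Editor's note] One observable effect of the saved_tgids rework above: the tgid table is now part of savedcmd and is sized by cmdline_num rather than the fixed SAVED_CMDLINES_DEFAULT, so growing the cmdline cache grows the pid-to-tgid map with it. A rough userspace sketch; the tracefs mount path and the resize value are assumptions about the target system:

#include <stdio.h>

/* Adjust for /sys/kernel/tracing on systems that mount tracefs there. */
#define TRACE_DIR "/sys/kernel/debug/tracing"

int main(void)
{
	FILE *f;
	char line[64];

	/* Grow the cmdline cache; with the change above the tgid table
	 * is allocated to match, so every cached entry has a tgid slot. */
	f = fopen(TRACE_DIR "/saved_cmdlines_size", "w");
	if (f) {
		fputs("1024\n", f);
		fclose(f);
	}

	f = fopen(TRACE_DIR "/saved_tgids", "r");
	if (!f) {
		perror("saved_tgids");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* cached pid/tgid pairs */
	fclose(f);
	return 0;
}

Each entry pairs a cached pid with the tgid recorded by trace_save_cmdline(), and tracing_saved_tgids_read() now sizes its output buffer from cmdline_num as well.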
