author     Blagovest Kolenichev <bkolenichev@codeaurora.org>  2017-11-06 07:02:23 -0800
committer  Blagovest Kolenichev <bkolenichev@codeaurora.org>  2017-11-06 15:58:47 -0800
commit     985aecee1dfefe1471290deffdebd2a9a2f073c9 (patch)
tree       8c55a0ea6933c9ee3e0649ccda5fbd03c013367b /kernel/sched/core.c
parent     22b18281457de02c9c830504fd4d48726db86f31 (diff)
parent     ceee5bdd470586fddfbbb8c6d0287ba792525d3f (diff)
Merge android-4.4@ceee5bd (v4.4.95) into msm-4.4
* refs/heads/tmp-ceee5bd:
  BACKPORT: arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux
  sched/core: fix have_sched_energy_data build warning
  sched/core: Warn if ENERGY_AWARE is enabled but data is missing
  sched: walt: Correct WALT window size initialization
  FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide
  sched: WALT: account cumulative window demand
  sched/fair: remove useless variable in find_best_target
  sched/tune: access schedtune_initialized under CGROUP_SCHEDTUNE
  sched/fair: consider task utilization in group_max_util()
  sched/fair: consider task utilization in group_norm_util()
  sched/fair: enforce EAS mode
  sched/fair: ignore backup CPU when not valid
  sched/fair: trace energy_diff for non boosted tasks
  UPSTREAM: sched/fair: Sync task util before slow-path wakeup
  UPSTREAM: sched/fair: Fix usage of find_idlest_group() when the local group is idlest
  UPSTREAM: sched/fair: Fix usage of find_idlest_group() when no groups are allowed
  BACKPORT: sched/fair: Fix find_idlest_group when local group is not allowed
  UPSTREAM: sched/fair: Remove unnecessary comparison with -1
  BACKPORT: sched/fair: Move select_task_rq_fair slow-path into its own function
  UPSTREAM: sched/fair: Force balancing on nohz balance if local group has capacity
  UPSTREAM: sched/core: Add missing update_rq_clock() call in set_user_nice()
  UPSTREAM: sched/core: Add missing update_rq_clock() call for task_hot()
  UPSTREAM: sched/core: Add missing update_rq_clock() in detach_task_cfs_rq()
  UPSTREAM: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg()
  UPSTREAM: sched/core: Fix find_idlest_group() for fork
  BACKPORT: sched/fair: Fix PELT integrity for new tasks
  BACKPORT: sched/cgroup: Fix cpu_cgroup_fork() handling
  UPSTREAM: sched/fair: Fix and optimize the fork() path
  BACKPORT: sched/fair: Make it possible to account fair load avg consistently
  cpufreq/sched: Consider max cpu capacity when choosing frequencies
  Linux 4.4.95
  FS-Cache: fix dereference of NULL user_key_payload
  fscrypto: require write access to mount to set encryption policy
  KEYS: Fix race between updating and finding a negative key
  fscrypt: fix dereference of NULL user_key_payload
  f2fs crypto: add missing locking for keyring_key access
  f2fs crypto: replace some BUG_ON()'s with error checks
  sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task()
  parisc: Fix double-word compare and exchange in LWS code on 32-bit kernels
  parisc: Avoid trashing sr2 and sr3 in LWS code
  pkcs7: Prevent NULL pointer dereference, since sinfo is not always set.
  KEYS: don't let add_key() update an uninstantiated key
  lib/digsig: fix dereference of NULL user_key_payload
  KEYS: encrypted: fix dereference of NULL user_key_payload
  rtlwifi: rtl8821ae: Fix connection lost problem
  clockevents/drivers/cs5535: Improve resilience to spurious interrupts
  bus: mbus: fix window size calculation for 4GB windows
  brcmsmac: make some local variables 'static const' to reduce stack size
  i2c: ismt: Separate I2C block read from SMBus block read
  ALSA: hda: Remove superfluous '-' added by printk conversion
  ALSA: seq: Enable 'use' locking in all configurations
  drm/nouveau/mmu: flush tlbs before deleting page tables
  drm/nouveau/bsp/g92: disable by default
  can: esd_usb2: Fix can_dlc value for received RTR, frames
  usb: musb: Check for host-mode using is_host_active() on reset interrupt
  usb: musb: sunxi: Explicitly release USB PHY on exit
  can: gs_usb: fix busy loop if no more TX context is available
  ALSA: usb-audio: Add native DSD support for Pro-Ject Pre Box S2 Digital
  usb: hub: Allow reset retry for USB2 devices on connect bounce
  usb: quirks: add quirk for WORLDE MINI MIDI keyboard
  usb: cdc_acm: Add quirk for Elatec TWN3
  USB: serial: metro-usb: add MS7820 device id
  USB: core: fix out-of-bounds access bug in usb_get_bos_descriptor()
  USB: devio: Revert "USB: devio: Don't corrupt user memory"
  ANDROID: binder: show high watermark of alloc->pages.
  ANDROID: binder: Add thread->process_todo flag.
  UPSTREAM: arm64: compat: Remove leftover variable declaration
  ANDROID: sched/fair: Select correct capacity state for energy_diff
  Revert "UPSTREAM: efi/libstub/arm64: Set -fpie when building the EFI stub"
  cpufreq: schedutil: clamp util to CPU maximum capacity
  FROMLIST: android: binder: Fix null ptr dereference in debug msg
  FROMLIST: android: binder: Change binder_shrinker to static
  cpufreq/sched: Use cpu max freq rather than policy max

Conflicts:
	include/linux/sched.h
	kernel/sched/core.c
	kernel/sched/fair.c

Change-Id: I2751f851df741f00e797deaf2119872b3dced655
Signed-off-by: Blagovest Kolenichev <bkolenichev@codeaurora.org>
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 184
1 file changed, 144 insertions(+), 40 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c408280ddd12..eacfd2ac56a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -99,6 +99,10 @@
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
+#ifdef CONFIG_SMP
+static bool have_sched_energy_data(void);
+#endif
+
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -201,6 +205,11 @@ static int sched_feat_set(char *cmp)
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
} else {
+#ifdef CONFIG_SMP
+ if (i == __SCHED_FEAT_ENERGY_AWARE)
+ WARN(!have_sched_energy_data(),
+ "Missing sched energy data\n");
+#endif
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
@@ -554,6 +563,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;
+ head->count++;
+
get_task_struct(task);
/*
@@ -563,6 +574,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
head->lastp = &node->next;
}
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ int sibling_count_hint);
+
void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;
@@ -577,10 +592,10 @@ void wake_up_q(struct wake_q_head *head)
task->wake_q.next = NULL;
/*
- * wake_up_process() implies a wmb() to pair with the queueing
+ * try_to_wake_up() implies a wmb() to pair with the queueing
* in wake_q_add() so as not to miss wakeups.
*/
- wake_up_process(task);
+ try_to_wake_up(task, TASK_NORMAL, 0, head->count);
put_task_struct(task);
}
}
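
The wake_q hunks above rely on a count field in struct wake_q_head (added to the header by the FROMLIST "Use wake_q length as a hint for wake_wide" patch in this merge) that records how many tasks were queued, so each wakee receives the batch size as its sibling_count_hint. Below is a minimal userspace sketch of that pattern; the toy_ names and simplified types are illustrative, not the kernel's API:

#include <stdio.h>

/* Simplified model of a wake queue that remembers its length. */
struct toy_task {
	const char *name;
	struct toy_task *next;
};

struct toy_wake_q {
	struct toy_task *first;
	struct toy_task **lastp;
	int count;	/* queue length, used as the wakeup hint */
};

static void toy_wake_q_init(struct toy_wake_q *q)
{
	q->first = NULL;
	q->lastp = &q->first;
	q->count = 0;
}

static void toy_wake_q_add(struct toy_wake_q *q, struct toy_task *t)
{
	t->next = NULL;
	*q->lastp = t;
	q->lastp = &t->next;
	q->count++;	/* mirrors the head->count++ in the hunk above */
}

/* Stand-in for try_to_wake_up(..., sibling_count_hint). */
static void toy_wake(struct toy_task *t, int sibling_count_hint)
{
	printf("waking %s (siblings in batch: %d)\n", t->name, sibling_count_hint);
}

static void toy_wake_up_q(struct toy_wake_q *q)
{
	for (struct toy_task *t = q->first; t; t = t->next)
		toy_wake(t, q->count);	/* every wakee sees the batch size */
}

int main(void)
{
	struct toy_task a = { .name = "a" }, b = { .name = "b" };
	struct toy_wake_q q;

	toy_wake_q_init(&q);
	toy_wake_q_add(&q, &a);
	toy_wake_q_add(&q, &b);
	toy_wake_up_q(&q);	/* both wakeups carry hint = 2 */
	return 0;
}
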
@@ -1702,14 +1717,16 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
+ int sibling_count_hint)
{
bool allow_isolated = (p->flags & PF_KTHREAD);
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
+ sibling_count_hint);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2007,6 +2024,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
+ * @sibling_count_hint: A hint at the number of threads that are being woken up
+ * in this event.
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
@@ -2018,7 +2037,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* or @state didn't match @p's state.
*/
static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ int sibling_count_hint)
{
unsigned long flags;
int cpu, src_cpu, success = 0;
@@ -2134,7 +2154,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
+ sibling_count_hint);
/* Refresh src_cpu as it could have changed since we last read it */
src_cpu = task_cpu(p);
@@ -2236,7 +2257,7 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_NORMAL, 0);
+ return try_to_wake_up(p, TASK_NORMAL, 0, 1);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2256,13 +2277,13 @@ EXPORT_SYMBOL(wake_up_process);
int wake_up_process_no_notif(struct task_struct *p)
{
WARN_ON(task_is_stopped_or_traced(p));
- return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
+ return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1);
}
EXPORT_SYMBOL(wake_up_process_no_notif);
int wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up(p, state, 0, 1);
}
/*
@@ -2337,9 +2358,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SCHED_WALT
+ p->last_sleep_ts = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2429,11 +2457,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2470,8 +2498,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2481,7 +2508,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2614,6 +2647,8 @@ void wake_up_new_task(struct task_struct *p)
add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ p->state = TASK_RUNNING;
+
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2621,11 +2656,15 @@ void wake_up_new_task(struct task_struct *p)
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
#endif
rq = __task_rq_lock(p);
mark_task_starting(p);
+ update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
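
Taken together, the fork-path hunks change the task lifecycle: the child is created in TASK_NEW, its CPU is set exactly once with __set_task_cpu() (so no migration hooks fire), and it only becomes TASK_RUNNING in wake_up_new_task(), where fork balancing and post_init_entity_util_avg() run under a freshly updated rq clock. A toy model of that ordering follows; the toy_ names are illustrative stand-ins, not kernel functions:

#include <assert.h>
#include <stdio.h>

enum toy_state { TOY_TASK_NEW, TOY_TASK_RUNNING };

struct toy_task {
	enum toy_state state;
	int cpu;
	int on_rq;
};

/* sched_fork(): the child is NEW; nothing may wake or queue it yet. */
static void toy_sched_fork(struct toy_task *p)
{
	p->state = TOY_TASK_NEW;
	p->cpu = 0;	/* __set_task_cpu(): first placement, no migration hooks */
}

/* wake_up_new_task(): the task becomes runnable only here. */
static void toy_wake_up_new_task(struct toy_task *p)
{
	p->state = TOY_TASK_RUNNING;
	p->cpu = 1;	/* fork balancing may re-place it before queueing */
	p->on_rq = 1;	/* activate_task(..., ENQUEUE_WAKEUP_NEW) */
}

int main(void)
{
	struct toy_task p = { .state = TOY_TASK_NEW };

	toy_sched_fork(&p);
	assert(p.state == TOY_TASK_NEW && !p.on_rq);

	toy_wake_up_new_task(&p);
	assert(p.state == TOY_TASK_RUNNING && p.on_rq);

	printf("fork -> wake ordering ok (cpu=%d)\n", p.cpu);
	return 0;
}
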
@@ -3071,7 +3110,7 @@ void sched_exec(void)
raw_spin_lock_irqsave(&p->pi_lock, flags);
curr_cpu = task_cpu(p);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -3171,7 +3210,9 @@ static void sched_freq_tick_pelt(int cpu)
* utilization and to harm its performance the least, request
* a jump to a higher OPP as soon as the margin of free capacity
* is impacted (specified by capacity_margin).
+ * Remember CPU utilization in sched_capacity_reqs should be normalised.
*/
+ cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, cpu_utilization);
}
@@ -3198,7 +3239,9 @@ static void sched_freq_tick_walt(int cpu)
* It is likely that the load is growing so we
* keep the added margin in our request as an
* extra boost.
+ * Remember CPU utilization in sched_capacity_reqs should be normalised.
*/
+ cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, cpu_utilization);
}
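
Both tick paths now rescale the absolute utilization into the 0..SCHED_CAPACITY_SCALE (1024) range before passing it to set_cfs_cpu_capacity(). A quick standalone check of the arithmetic, assuming a CPU whose capacity_orig_of() is 600 (an illustrative value):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
	unsigned long capacity_orig = 600;	/* assumed capacity_orig_of(cpu) */
	unsigned long cpu_utilization = 300;	/* absolute utilization */

	/* Same integer arithmetic as the two hunks above. */
	unsigned long normalized =
		cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig;

	/* 300 * 1024 / 600 = 512, i.e. 50% of a 1024-capacity CPU. */
	printf("normalized utilization: %lu\n", normalized);
	return 0;
}
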
@@ -3579,6 +3622,10 @@ static void __sched notrace __schedule(bool preempt)
if (!is_idle_task(prev) && !prev->on_rq)
update_avg_burst(prev);
+#ifdef CONFIG_SCHED_WALT
+ if (!prev->on_rq)
+ prev->last_sleep_ts = wallclock;
+#endif
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3755,7 +3802,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
- return try_to_wake_up(curr->private, mode, wake_flags);
+ return try_to_wake_up(curr->private, mode, wake_flags, 1);
}
EXPORT_SYMBOL(default_wake_function);
@@ -3781,6 +3828,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3876,6 +3924,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -4303,6 +4353,7 @@ recheck:
* runqueue lock must be held.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
/*
* Changing the policy of the stop threads its a very bad idea
@@ -7151,6 +7202,19 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
+static bool have_sched_energy_data(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!rcu_dereference(per_cpu(sd_scs, cpu)) ||
+ !rcu_dereference(per_cpu(sd_ea, cpu)))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Check that the per-cpu provided sd energy data is consistent for all cpus
* within the mask.
@@ -7967,6 +8031,9 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
rcu_read_unlock();
+ WARN(sched_feat(ENERGY_AWARE) && !have_sched_energy_data(),
+ "Missing data for energy aware scheduling\n");
+
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
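
The new have_sched_energy_data() treats the energy model as usable only if every possible CPU publishes both its sd_scs and sd_ea pointers; one missing CPU fails the whole check, which is what the WARN above reports when ENERGY_AWARE is enabled. A userspace model of that all-or-nothing test (RCU and per-CPU accessors omitted; the _model names are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Stand-ins for the per-CPU sd_scs / sd_ea pointers. */
static void *sd_scs[NR_CPUS], *sd_ea[NR_CPUS];

/* Mirrors have_sched_energy_data(): every CPU must expose both pointers. */
static bool have_sched_energy_data_model(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!sd_scs[cpu] || !sd_ea[cpu])
			return false;
	}
	return true;
}

int main(void)
{
	int dummy;

	printf("no data:  %d\n", have_sched_energy_data_model());	/* 0 */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sd_scs[cpu] = sd_ea[cpu] = &dummy;
	printf("complete: %d\n", have_sched_energy_data_model());	/* 1 */
	return 0;
}
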
@@ -8784,27 +8851,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- unsigned long flags;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &flags);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -8817,11 +8866,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -9258,15 +9333,28 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
- sched_move_task(task);
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &flags);
+
+ update_rq_clock(rq);
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &flags);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -9277,8 +9365,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
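
The can_attach change closes the PELT race by refusing cgroup attach for a task still in TASK_NEW, i.e. one that has gone through sched_fork() but not yet wake_up_new_task(), so sched_move_task() can never try to detach and re-attach load-average state that was never attached. A simplified model of the gate, without the pi_lock serialization (toy_ names are illustrative):

#include <errno.h>
#include <stdio.h>

enum toy_state { TOY_TASK_NEW, TOY_TASK_RUNNING };

struct toy_task { enum toy_state state; };

/* Mirrors the new check: refuse attach until wake_up_new_task() has run. */
static int toy_can_attach(const struct toy_task *p)
{
	if (p->state == TOY_TASK_NEW)
		return -EINVAL;	/* fair-class PELT state not attached yet */
	return 0;
}

int main(void)
{
	struct toy_task p = { .state = TOY_TASK_NEW };

	printf("attach while NEW:  %d\n", toy_can_attach(&p));	/* -22 (-EINVAL) */

	p.state = TOY_TASK_RUNNING;	/* as set by wake_up_new_task() */
	printf("attach after wake: %d\n", toy_can_attach(&p));	/* 0 */
	return 0;
}
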
static void cpu_cgroup_attach(struct cgroup_taskset *tset)