summaryrefslogtreecommitdiff
path: root/kernel/sched/rt.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/rt.c')
-rw-r--r--kernel/sched/rt.c477
1 files changed, 350 insertions, 127 deletions
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..4184d4a97fd2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,10 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <linux/hrtimer.h>
+
+#include "walt.h"
+#include "tune.h"
int sched_rr_timeslice = RR_TIMESLICE;
@@ -64,10 +68,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -87,13 +87,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -833,6 +826,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
struct rq *rq = rq_of_rt_rq(rt_rq);
raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+
if (rt_rq->rt_time) {
u64 runtime;
@@ -889,6 +884,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
+static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *rt_se;
+ char buf[500];
+ char *pos = buf;
+ char *end = buf + sizeof(buf);
+ int idx;
+
+ pos += snprintf(pos, sizeof(buf),
+ "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
+ rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
+ goto out;
+
+ pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
+ idx = sched_find_first_bit(array->bitmap);
+ while (idx < MAX_RT_PRIO) {
+ list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
+ if (pos < end)
+ pos += snprintf(pos, end - pos, "\t%s (%d)\n",
+ p->comm, p->pid);
+ }
+ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+ }
+out:
+#ifdef CONFIG_PANIC_ON_RT_THROTTLING
+ /*
+ * Use pr_err() in the BUG() case since printk_sched() will
+ * not get flushed and deadlock is not a concern.
+ */
+ pr_err("%s", buf);
+ BUG();
+#else
+ printk_deferred("%s", buf);
+#endif
+}
+
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -912,8 +952,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
+ static bool once = false;
+
rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
+
+ if (!once) {
+ once = true;
+ dump_throttled_rt_tasks(rt_rq);
+ }
} else {
/*
* In case we did anyway, make it go away,
@@ -932,6 +978,70 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#define RT_SCHEDTUNE_INTERVAL 50000000ULL
+
+static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
+{
+ struct sched_rt_entity *rt_se = container_of(timer,
+ struct sched_rt_entity,
+ schedtune_timer);
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Nothing to do if:
+ * - task has switched runqueues
+ * - task isn't RT anymore
+ */
+ if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
+ goto out;
+
+ /*
+ * If task got enqueued back during callback time, it means we raced
+ * with the enqueue on another cpu, that's Ok, just do nothing as
+ * enqueue path would have tried to cancel us and we shouldn't run
+ * Also check the schedtune_enqueued flag as class-switch on a
+ * sleeping task may have already canceled the timer and done dq
+ */
+ if (p->on_rq || !rt_se->schedtune_enqueued)
+ goto out;
+
+ /*
+ * RT task is no longer active, cancel boost
+ */
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+out:
+ raw_spin_unlock(&rq->lock);
+
+ /*
+ * This can free the task_struct if no more references.
+ */
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
+{
+ struct hrtimer *timer = &rt_se->schedtune_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = rt_schedtune_timer;
+ rt_se->schedtune_enqueued = false;
+}
+
+static void start_schedtune_timer(struct sched_rt_entity *rt_se)
+{
+ struct hrtimer *timer = &rt_se->schedtune_timer;
+
+ hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
+ HRTIMER_MODE_REL_PINNED);
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -949,6 +1059,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1261,9 +1374,37 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (!schedtune_task_boost(p))
+ return;
+
+ /*
+ * If schedtune timer is active, that means a boost was already
+ * done, just cancel the timer so that deboost doesn't happen.
+ * Otherwise, increase the boost. If an enqueued timer was
+ * cancelled, put the task reference.
+ */
+ if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
+ put_task_struct(p);
+
+ /*
+ * schedtune_enqueued can be true in the following situation:
+ * enqueue_task_rt grabs rq lock before timer fires
+ * or before its callback acquires rq lock
+ * schedtune_enqueued can be false if timer callback is running
+ * and timer just released rq lock, or if the timer finished
+ * running and canceling the boost
+ */
+ if (rt_se->schedtune_enqueued)
+ return;
+
+ rt_se->schedtune_enqueued = true;
+ schedtune_enqueue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1272,8 +1413,22 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ walt_dec_cumulative_runnable_avg(rq, p);
dequeue_pushable_task(rq, p);
+
+ if (!rt_se->schedtune_enqueued)
+ return;
+
+ if (flags == DEQUEUE_SLEEP) {
+ get_task_struct(p);
+ start_schedtune_timer(rt_se);
+ return;
+ }
+
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
}
/*
@@ -1313,8 +1468,35 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+/*
+ * Perform a schedtune dequeue and cancelation of boost timers if needed.
+ * Should be called only with the rq->lock held.
+ */
+static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ BUG_ON(!raw_spin_is_locked(&rq->lock));
+
+ if (!rt_se->schedtune_enqueued)
+ return;
+
+ /*
+ * Incase of class change cancel any active timers. If an enqueued
+ * timer was cancelled, put the task ref.
+ */
+ if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
+ put_task_struct(p);
+
+ /* schedtune_enqueued is true, deboost it */
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, task_cpu(p));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+}
+
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
struct task_struct *curr;
struct rq *rq;
@@ -1366,6 +1548,19 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rcu_read_unlock();
out:
+ /*
+ * If previous CPU was different, make sure to cancel any active
+ * schedtune timers and deboost.
+ */
+ if (task_cpu(p) != cpu) {
+ unsigned long fl;
+ struct rq *prq = task_rq(p);
+
+ raw_spin_lock_irqsave(&prq->lock, fl);
+ schedtune_dequeue_rt(prq, p);
+ raw_spin_unlock_irqrestore(&prq->lock, fl);
+ }
+
return cpu;
}
@@ -1780,7 +1975,9 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, lowest_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(lowest_rq, next_task, 0);
ret = 1;
@@ -1802,160 +1999,172 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
{
- int prev_cpu = rq->rt.push_cpu;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rt_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
+
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
{
- int cpu;
+ atomic_set_release(v, 0);
+}
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
- /* When here, there's no IPI going around */
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
+
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid cpu
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq->rd);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
-
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
-
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
+ rq = this_rq();
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ raw_spin_lock(&rd->rto_lock);
- cpu = find_next_push_cpu(src_rq);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rd);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ raw_spin_unlock(&rd->rto_lock);
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0) {
+ sched_put_rd(rd);
return;
-
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
+ }
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
@@ -1965,8 +2174,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -1975,6 +2185,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
@@ -2034,7 +2249,9 @@ static void pull_rt_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
/*
* We continue with the search, just in
@@ -2095,6 +2312,13 @@ static void rq_offline_rt(struct rq *rq)
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
/*
+ * On class switch from rt, always cancel active schedtune timers,
+ * this handles the cases where we switch class for a task that is
+ * already rt-dequeued but has a running timer.
+ */
+ schedtune_dequeue_rt(rq, p);
+
+ /*
* If there are other RT tasks then we will reschedule
* and the scheduling of the other RT tasks will handle
* the balancing. But if we are the last RT task
@@ -2136,10 +2360,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
-#else
- if (p->prio < rq->curr->prio)
- resched_curr(rq);
#endif /* CONFIG_SMP */
+ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
+ resched_curr(rq);
}
}