summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinux Build Service Account <lnxbuild@quicinc.com>2018-01-11 04:14:45 -0800
committerGerrit - the friendly Code Review server <code-review@localhost>2018-01-11 04:14:45 -0800
commite525ef12a4ba7da1569592615fddf082c51b2f3e (patch)
tree5733d8ac1a78313edf973cf0e265f0fefafd4308 /kernel
parent1707cb3ff140dec01127669e1c88d5b850f272e4 (diff)
parent80eb7ccae411759ac85501602aae882dede1c7fa (diff)
Merge "Merge android-4.4.104 (8bc4213) into msm-4.4"
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/rt.c235
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/time/timekeeping.c57
-rw-r--r--kernel/workqueue_internal.h3
5 files changed, 171 insertions, 154 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13a64488c3d2..2370e7631728 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6641,6 +6641,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_dlo_mask;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index af6a7f424d94..c290db7f289a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -68,10 +68,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -91,13 +87,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -2104,160 +2093,166 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
static int rto_next_cpu(struct rq *rq)
{
- int prev_cpu = rq->rt.push_cpu;
+ struct root_domain *rd = rq->rd;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rt_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
+
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
+}
+
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline void rto_start_unlock(atomic_t *v)
+{
+ atomic_set_release(v, 0);
+}
static void tell_cpu_to_push(struct rq *rq)
{
- int cpu;
+ int cpu = -1;
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- /* When here, there's no IPI going around */
-
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
+
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid cpu
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0)
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
+ rq = this_rq();
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
-
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ raw_spin_lock(&rq->rd->rto_lock);
- cpu = find_next_push_cpu(src_rq);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ raw_spin_unlock(&rq->rd->rto_lock);
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0)
return;
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
-
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b31973c6db3..ca2294d06f44 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -532,7 +532,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
@@ -553,12 +553,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
- int push_flags;
- int push_cpu;
- struct irq_work push_work;
- raw_spinlock_t push_lock;
-#endif
#endif /* CONFIG_SMP */
int rt_queued;
@@ -651,6 +645,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -667,6 +674,9 @@ struct root_domain {
extern struct root_domain def_root_domain;
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 738f3467d169..fc86fdcce932 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
+ while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
+ tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ tk->raw_sec++;
+ }
}
static inline struct timespec64 tk_xtime(struct timekeeper *tk)
@@ -277,18 +281,19 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Go back from cycles -> shifted ns */
tk->xtime_interval = (u64) interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
- tk->raw_interval =
- ((u64) interval * clock->mult) >> clock->shift;
+ tk->raw_interval = interval * clock->mult;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
- if (shift_change < 0)
+ if (shift_change < 0) {
tk->tkr_mono.xtime_nsec >>= -shift_change;
- else
+ tk->tkr_raw.xtime_nsec >>= -shift_change;
+ } else {
tk->tkr_mono.xtime_nsec <<= shift_change;
+ tk->tkr_raw.xtime_nsec <<= shift_change;
+ }
}
- tk->tkr_raw.xtime_nsec = 0;
tk->tkr_mono.shift = clock->shift;
tk->tkr_raw.shift = clock->shift;
@@ -617,9 +622,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
- /* Update the monotonic raw base */
- tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
-
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
@@ -629,6 +631,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
+
+ /* Update the monotonic raw base */
+ seconds = tk->raw_sec;
+ nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
+ tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
}
/* must hold timekeeper_lock */
@@ -670,7 +677,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
static void timekeeping_forward_now(struct timekeeper *tk)
{
cycle_t cycle_now, delta;
- s64 nsec;
cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
@@ -682,10 +688,13 @@ static void timekeeping_forward_now(struct timekeeper *tk)
/* If arch requires, add in get_arch_timeoffset() */
tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
- tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
- timespec64_add_ns(&tk->raw_time, nsec);
+ tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
+
+ tk_normalize_xtime(tk);
}
/**
@@ -1179,19 +1188,18 @@ int timekeeping_notify(struct clocksource *clock)
void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts64;
unsigned long seq;
s64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->raw_sec;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
- ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
- timespec64_add_ns(&ts64, nsecs);
- *ts = ts64;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getrawmonotonic64);
@@ -1315,8 +1323,7 @@ void __init timekeeping_init(void)
tk_setup_internals(tk, clock);
tk_set_xtime(tk, &now);
- tk->raw_time.tv_sec = 0;
- tk->raw_time.tv_nsec = 0;
+ tk->raw_sec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -1796,7 +1803,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
unsigned int *clock_set)
{
cycle_t interval = tk->cycle_interval << shift;
- u64 raw_nsecs;
+ u64 snsec_per_sec;
/* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
@@ -1811,14 +1818,12 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = (u64)tk->raw_interval << shift;
- raw_nsecs += tk->raw_time.tv_nsec;
- if (raw_nsecs >= NSEC_PER_SEC) {
- u64 raw_secs = raw_nsecs;
- raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- tk->raw_time.tv_sec += raw_secs;
+ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
+ snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
+ tk->tkr_raw.xtime_nsec -= snsec_per_sec;
+ tk->raw_sec++;
}
- tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 45215870ac6c..3fa9c146fccb 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -9,6 +9,7 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/preempt.h>
struct worker_pool;
@@ -59,7 +60,7 @@ struct worker {
*/
static inline struct worker *current_wq_worker(void)
{
- if (current->flags & PF_WQ_WORKER)
+ if (in_task() && (current->flags & PF_WQ_WORKER))
return kthread_data(current);
return NULL;
}