summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
authorBrendan Jackman <brendan.jackman@arm.com>2017-08-07 15:46:13 +0100
committerJoel Fernandes <joelaf@google.com>2017-10-27 18:50:59 +0000
commit38ddcff85af052ad99e4f3f0b6e9659b0ca10dcf (patch)
tree4b77f5c865db07da3dbce204daf85c865bdf6140 /kernel/sched/fair.c
parent43bd960dfe728284e1059f11d6c686d23887c1c6 (diff)
FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide
(from https://patchwork.kernel.org/patch/9895261/) This patch adds a parameter to select_task_rq, sibling_count_hint allowing the caller, where it has this information, to inform the sched_class the number of tasks that are being woken up as part of the same event. The wake_q mechanism is one case where this information is available. select_task_rq_fair can then use the information to detect that it needs to widen the search space for task placement in order to avoid overloading the last-level cache domain's CPUs. * * * The reason I am investigating this change is the following use case on ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep until the last task finishes its X and hits the barrier). On big.LITTLE, the tasks which get a "big" CPU finish faster, and then those CPUs pull over the tasks that are still running: v CPU v ->time-> ------------- 0 (big) 11111 /333 ------------- 1 (big) 22222 /444| ------------- 2 (LITTLE) 333333/ ------------- 3 (LITTLE) 444444/ ------------- Now when task 4 hits the barrier (at |) and wakes the others up, there are 4 tasks with prev_cpu=<big> and 0 tasks with prev_cpu=<little>. want_affine therefore means that we'll only look in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs until the next load balance, something like this: v CPU v ->time-> ------------------------ 0 (big) 11111 /333 31313\33333 ------------------------ 1 (big) 22222 /444|424\4444444 ------------------------ 2 (LITTLE) 333333/ \222222 ------------------------ 3 (LITTLE) 444444/ \1111 ------------------------ ^^^ underutilization So, I'm trying to get want_affine = 0 for these tasks. I don't _think_ any incarnation of the wakee_flips mechanism can help us here because which task is waker and which tasks are wakees generally changes with each iteration. However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the nice property that we know exactly how many tasks are being woken, so we can cheat. It might be a disadvantage that we "widen" _every_ task that's woken in an event, while select_idle_sibling would work fine for the first sd_llc_size - 1 tasks. IIUC, if wake_affine() behaves correctly this trick wouldn't be necessary on SMP systems, so it might be best guarded by the presence of SD_ASYM_CPUCAPACITY? * * * Final note.. In order to observe "perfect" behaviour for this use case, I also had to disable the TTWU_QUEUE sched feature. Suppose during the wakeup above we are working through the work queue and have placed tasks 3 and 2, and are about to place task 1: v CPU v ->time-> -------------- 0 (big) 11111 /333 3 -------------- 1 (big) 22222 /444|4 -------------- 2 (LITTLE) 333333/ 2 -------------- 3 (LITTLE) 444444/ <- Task 1 should go here -------------- If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having instead sent a reschedule IPI) or attached its load to CPU 2. So we are likely to also place task 1 on cpu 2. Disabling TTWU_QUEUE means that we enqueue task 2 before placing task 1, solving this issue. TTWU_QUEUE is there to minimise rq lock contention, and I guess that this contention is less of an issue on big.LITTLE systems since they have relatively few CPUs, which suggests the trade-off makes sense here. Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c Signed-off-by: Brendan Jackman <brendan.jackman@arm.com> Signed-off-by: Chris Redpath <chris.redpath@arm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Joel Fernandes <joelaf@google.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Matt Fleming <matt@codeblueprint.co.uk>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c21
1 files changed, 14 insertions, 7 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9bc717ef81f5..b904a023be95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5773,15 +5773,18 @@ energy_diff(struct energy_env *eenv)
* being client/server, worker/dispatcher, interrupt source or whatever is
* irrelevant, spread criteria is apparent partner count exceeds socket size.
*/
-static int wake_wide(struct task_struct *p)
+static int wake_wide(struct task_struct *p, int sibling_count_hint)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int llc_size = this_cpu_read(sd_llc_size);
+
+ if (sibling_count_hint >= llc_size)
+ return 1;
if (master < slave)
swap(master, slave);
- if (slave < factor || master < slave * factor)
+ if (slave < llc_size || master < slave * llc_size)
return 0;
return 1;
}
@@ -6754,7 +6757,8 @@ unlock:
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+ int sibling_count_hint)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -6762,9 +6766,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ if (sd_flag & SD_BALANCE_WAKE) {
+ record_wakee(p);
+ want_affine = !wake_wide(p, sibling_count_hint) &&
+ !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
+ }
if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu, sync);