Diffstat (limited to 'kernel/sched/rt.c')
-rw-r--r--	kernel/sched/rt.c	331
1 file changed, 228 insertions, 103 deletions
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ac81704e14d9..9d7f6998edd5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1439,6 +1439,25 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 }
 
 /*
+ * Keep track of whether each cpu has an RT task that will
+ * soon schedule on that core. The problem this is intended
+ * to address is that we want to avoid entering a non-preemptible
+ * softirq handler if we are about to schedule a real-time
+ * task on that core. Ideally, we could just check whether
+ * the RT runqueue on that core had a runnable task, but the
+ * window between choosing to schedule a real-time task
+ * on a core and actually enqueueing it on that run-queue
+ * is large enough to lose races at an unacceptably high rate.
+ *
+ * This variable attempts to reduce that window by indicating
+ * when we have decided to schedule an RT task on a core
+ * but not yet enqueued it.
+ * This variable is a heuristic only: it is not guaranteed
+ * to be correct and may be updated without synchronization.
+ */
+DEFINE_PER_CPU(bool, incoming_rt_task);
+
+/*
  * Adding/removing a task to/from a priority array:
  */
 static void
@@ -1459,6 +1478,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
+	*per_cpu_ptr(&incoming_rt_task, cpu_of(rq)) = false;
 
 	if (!schedtune_task_boost(p))
 		return;
@@ -1551,8 +1571,19 @@ static void yield_task_rt(struct rq *rq)
 	requeue_task_rt(rq, rq->curr, 0);
 }
 
+/*
+ * Return whether the given cpu has (or will shortly have) an RT task
+ * ready to run. NB: This is a heuristic and is subject to races.
+ */
+bool
+cpu_has_rt_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	return rq->rt.rt_nr_running > 0 || per_cpu(incoming_rt_task, cpu);
+}
+
 #ifdef CONFIG_SMP
-static int find_lowest_rq(struct task_struct *task);
+static int find_lowest_rq(struct task_struct *task, int sync);
 
 #ifdef CONFIG_SCHED_HMP
 static int
@@ -1561,7 +1592,7 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
 	int target;
 
 	rcu_read_lock();
-	target = find_lowest_rq(p);
+	target = find_lowest_rq(p, 0);
 	if (target != -1)
 		cpu = target;
 	rcu_read_unlock();
@@ -1573,8 +1604,10 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
 /*
  * Return whether the task on the given cpu is currently non-preemptible
  * while handling a potentially long softint, or if the task is likely
- * to block preemptions soon because it is a ksoftirq thread that is
- * handling slow softints.
+ * to block preemptions soon because (a) it is a ksoftirq thread that is
+ * handling slow softints, (b) it is idle and therefore likely to start
+ * processing the irq's immediately, (c) the cpu is currently handling
+ * hard irq's and will soon move on to the softirq handler.
  */
 bool
 task_may_not_preempt(struct task_struct *task, int cpu)
@@ -1584,8 +1617,9 @@ task_may_not_preempt(struct task_struct *task, int cpu)
 	struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
 
 	return ((softirqs & LONG_SOFTIRQ_MASK) &&
-		(task == cpu_ksoftirqd ||
-		 task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+		(task == cpu_ksoftirqd || is_idle_task(task) ||
+		 (task_thread_info(task)->preempt_count
+			& (HARDIRQ_MASK | SOFTIRQ_MASK))));
 }
 
 /*
@@ -1618,9 +1652,11 @@ static int
 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
 		  int sibling_count_hint)
 {
-	struct task_struct *curr;
+	struct task_struct *curr, *tgt_task;
 	struct rq *rq;
 	bool may_not_preempt;
+	int target;
+	int sync = flags & WF_SYNC;
 
 #ifdef CONFIG_SCHED_HMP
 	return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
@@ -1635,58 +1671,28 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
 	rcu_read_lock();
 	curr = READ_ONCE(rq->curr); /* unlocked access */
 
-	/*
-	 * If the current task on @p's runqueue is a softirq task,
-	 * it may run without preemption for a time that is
-	 * ill-suited for a waiting RT task. Therefore, try to
-	 * wake this RT task on another runqueue.
-	 *
-	 * Also, if the current task on @p's runqueue is an RT task, then
-	 * it may run without preemption for a time that is
-	 * ill-suited for a waiting RT task. Therefore, try to
-	 * wake this RT task on another runqueue.
-	 *
-	 * Also, if the current task on @p's runqueue is an RT task, then
-	 * try to see if we can wake this RT task up on another
-	 * runqueue. Otherwise simply start this RT task
-	 * on its current runqueue.
-	 *
-	 * We want to avoid overloading runqueues. If the woken
-	 * task is a higher priority, then it will stay on this CPU
-	 * and the lower prio task should be moved to another CPU.
-	 * Even though this will probably make the lower prio task
-	 * lose its cache, we do not want to bounce a higher task
-	 * around just because it gave up its CPU, perhaps for a
-	 * lock?
-	 *
-	 * For equal prio tasks, we just let the scheduler sort it out.
-	 *
-	 * Otherwise, just let it ride on the affined RQ and the
-	 * post-schedule router will push the preempted task away
-	 *
-	 * This test is optimistic, if we get it wrong the load-balancer
-	 * will have to sort it out.
-	 */
 	may_not_preempt = task_may_not_preempt(curr, cpu);
-	if (may_not_preempt ||
-	    (unlikely(rt_task(curr)) &&
-	     (curr->nr_cpus_allowed < 2 ||
-	      curr->prio <= p->prio))) {
-		int target = find_lowest_rq(p);
+	target = find_lowest_rq(p, sync);
 
-		/*
-		 * If cpu is non-preemptible, prefer remote cpu
-		 * even if it's running a higher-prio task.
-		 * Otherwise: Don't bother moving it if the
-		 * destination CPU is not running a lower priority task.
-		 */
-		if (target != -1 &&
-		    (may_not_preempt ||
-		     p->prio < cpu_rq(target)->rt.highest_prio.curr))
-			cpu = target;
+	/*
+	 * Check once for losing a race with the other core's irq handler.
+	 * This does not happen frequently, but it can avoid delaying
+	 * the execution of the RT task in those cases.
+	 */
+	if (target != -1) {
+		tgt_task = READ_ONCE(cpu_rq(target)->curr);
+		if (task_may_not_preempt(tgt_task, target))
+			target = find_lowest_rq(p, sync);
 	}
+	/*
+	 * Possible race. Don't bother moving it if the
+	 * destination CPU is not running a lower priority task.
+	 */
+	if (target != -1 &&
+	    (may_not_preempt || p->prio < cpu_rq(target)->rt.highest_prio.curr))
+		cpu = target;
+	*per_cpu_ptr(&incoming_rt_task, cpu) = true;
 	rcu_read_unlock();
-
 out:
 	/*
 	 * If previous CPU was different, make sure to cancel any active
@@ -1730,7 +1736,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 		requeue_task_rt(rq, p, 1);
 	resched_curr(rq);
 }
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1994,12 +1999,108 @@ retry:
 }
 #endif /* CONFIG_SCHED_HMP */
 
-static int find_lowest_rq(struct task_struct *task)
+static int find_best_rt_target(struct task_struct* task, int cpu,
+			       struct cpumask* lowest_mask,
+			       bool boosted, bool prefer_idle)
 {
+	int iter_cpu;
+	int target_cpu = -1;
+	int boosted_cpu = -1;
+	int backup_cpu = -1;
+	int boosted_orig_capacity = capacity_orig_of(0);
+	int backup_capacity = 0;
+	int best_idle_cpu = -1;
+	unsigned long target_util = 0;
+	unsigned long new_util;
+	/* We want to elect the best one based on task class,
+	 * idleness, and utilization.
+	 */
+	for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+		int cur_capacity;
+		/*
+		 * Iterate from higher cpus for boosted tasks.
+		 */
+		int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+		if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(task)))
+			continue;
+
+		new_util = cpu_util(i) + task_util(task);
+
+		if (new_util > capacity_orig_of(i))
+			continue;
+
+		/*
+		 * Unconditionally favoring tasks that prefer idle cpus to
+		 * improve latency.
+		 */
+		if (idle_cpu(i) && prefer_idle
+		    && cpumask_test_cpu(i, lowest_mask) && best_idle_cpu < 0) {
+			best_idle_cpu = i;
+			continue;
+		}
+
+		if (cpumask_test_cpu(i, lowest_mask)) {
+			/* Bias cpu selection towards cpu with higher original
+			 * capacity if task is boosted.
+			 * Assumption: Higher cpus are exclusively alloted for
+			 * boosted tasks.
+			 */
+			if (boosted && boosted_cpu < 0
+			    && boosted_orig_capacity < capacity_orig_of(i)) {
+				boosted_cpu = i;
+				boosted_orig_capacity = capacity_orig_of(i);
+			}
+			cur_capacity = capacity_curr_of(i);
+			if (new_util < cur_capacity && cpu_rq(i)->nr_running) {
+				if(!boosted) {
+					/* Find a target cpu with highest utilization.*/
+					if (target_util < new_util) {
+						target_cpu = i;
+						target_util = new_util;
+					}
+				} else {
+					if (target_util == 0 || target_util > new_util) {
+						/* Find a target cpu with lowest utilization.*/
+						target_cpu = i;
+						target_util = new_util;
+					}
+				}
+			} else if (backup_capacity == 0 || backup_capacity < cur_capacity) {
+				/* Select a backup CPU with highest capacity.*/
+				backup_capacity = cur_capacity;
+				backup_cpu = i;
+			}
+		}
+	}
+
+	if (boosted && boosted_cpu >=0 && boosted_cpu > best_idle_cpu)
+		target_cpu = boosted_cpu;
+	else if (prefer_idle && best_idle_cpu >= 0)
+		target_cpu = best_idle_cpu;
+
+	if (target_cpu < 0) {
+		if (backup_cpu >= 0)
+			return backup_cpu;
+
+		/* Select current cpu if it is present in the mask.*/
+		if (cpumask_test_cpu(cpu, lowest_mask))
+			return cpu;
+
+		/* Pick a random cpu from lowest_mask */
+		target_cpu = cpumask_any(lowest_mask);
+		if (target_cpu < nr_cpu_ids)
+			return target_cpu;
+		return -1;
+	}
+	return target_cpu;
+}
+
+static int find_lowest_rq(struct task_struct *task, int sync)
+{
 	struct sched_domain *sd;
 	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu = task_cpu(task);
+	bool boosted, prefer_idle;
 
 #ifdef CONFIG_SCHED_HMP
 	return find_lowest_rq_hmp(task);
@@ -2012,64 +2113,88 @@ static int find_lowest_rq(struct task_struct *task)
 	if (task->nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
 
+	/* Constructing cpumask of lowest priorities */
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
-	/*
-	 * At this point we have built a mask of cpus representing the
-	 * lowest priority tasks in the system. Now we want to elect
-	 * the best one based on our affinity and topology.
-	 *
-	 * We prioritize the last cpu that the task executed on since
-	 * it is most likely cache-hot in that location.
+	/* Return current cpu if WF_SYNC hint is set and present in
+	 * lowest_mask. Improves data locality.
 	 */
-	if (cpumask_test_cpu(cpu, lowest_mask))
-		return cpu;
+	if (sysctl_sched_sync_hint_enable && sync) {
+		cpumask_t search_cpus;
+		cpumask_and(&search_cpus, tsk_cpus_allowed(task), lowest_mask);
+		if (cpumask_test_cpu(cpu, &search_cpus))
+			return cpu;
+	}
 
 	/*
-	 * Otherwise, we consult the sched_domains span maps to figure
-	 * out which cpu is logically closest to our hot cache data.
+	 * At this point we have built a mask of cpus representing the
+	 * lowest priority tasks in the system.
 	 */
-	if (!cpumask_test_cpu(this_cpu, lowest_mask))
-		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
-	rcu_read_lock();
-	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_AFFINE) {
-			int best_cpu;
-
-			/*
-			 * "this_cpu" is cheaper to preempt than a
-			 * remote processor.
-			 */
-			if (this_cpu != -1 &&
-			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
-				rcu_read_unlock();
-				return this_cpu;
-			}
+	boosted = schedtune_task_boost(task) > 0;
+	prefer_idle = schedtune_prefer_idle(task) > 0;
+	if(boosted || prefer_idle) {
+		return find_best_rt_target(task, cpu, lowest_mask, boosted, prefer_idle);
+	} else {
+		/* Now we want to elect the best one based on on our affinity
+		 * and topology.
+		 * We prioritize the last cpu that the task executed on since
+		 * it is most likely cache-hot in that location.
+		 */
+		struct task_struct* curr;
+		if (!cpumask_test_cpu(this_cpu, lowest_mask))
+			this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+		rcu_read_lock();
+		for_each_domain(cpu, sd) {
+			if (sd->flags & SD_WAKE_AFFINE) {
+				int best_cpu;
+				/*
+				 * "this_cpu" is cheaper to preempt than a
+				 * remote processor.
+				 */
+				if (this_cpu != -1 &&
+				    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+					curr = cpu_rq(this_cpu)->curr;
+					/* Ensuring that boosted/prefer idle
+					 * tasks are not pre-empted even if low
+					 * priority*/
+					if (!curr || (schedtune_task_boost(curr) == 0
+					    && schedtune_prefer_idle(curr) == 0)) {
+						rcu_read_unlock();
+						return this_cpu;
+					}
+				}
 
-			best_cpu = cpumask_first_and(lowest_mask,
-						     sched_domain_span(sd));
-			if (best_cpu < nr_cpu_ids) {
-				rcu_read_unlock();
-				return best_cpu;
+				best_cpu = cpumask_first_and(lowest_mask,
+							     sched_domain_span(sd));
+				if (best_cpu < nr_cpu_ids) {
+					curr = cpu_rq(best_cpu)->curr;
+					/* Ensuring that boosted/prefer idle
+					 * tasks are not pre-empted even if low
+					 * priority*/
+					if(!curr || (schedtune_task_boost(curr) == 0
+					    && schedtune_prefer_idle(curr) == 0)) {
						rcu_read_unlock();
+						return best_cpu;
+					}
+				}
 			}
 		}
-	}
-	rcu_read_unlock();
+		rcu_read_unlock();
 
-	/*
-	 * And finally, if there were no matches within the domains
-	 * just give the caller *something* to work with from the compatible
-	 * locations.
-	 */
-	if (this_cpu != -1)
-		return this_cpu;
+		/* And finally, if there were no matches within the domains just
+		 * give the caller *something* to work with from the compatible
+		 * locations.
+		 */
+		if (this_cpu != -1)
+			return this_cpu;
 
-	cpu = cpumask_any(lowest_mask);
-	if (cpu < nr_cpu_ids)
-		return cpu;
-	return -1;
+		cpu = cpumask_any(lowest_mask);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+		return -1;
+	}
 }
 
 /* Will lock the rq it finds */
@@ -2080,7 +2205,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 	int cpu;
 
 	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
-		cpu = find_lowest_rq(task);
+		cpu = find_lowest_rq(task, 0);
 
 		if ((cpu == -1) || (cpu == rq->cpu))
 			break;
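Nothing in this file consumes cpu_has_rt_task(); the helper is left non-static because, per the comment above DEFINE_PER_CPU(incoming_rt_task), the point is to let the softirq path avoid running a long, non-preemptible handler inline on a cpu that is about to run an RT task. The fragment below is only a sketch of how such a consumer could look; it is not part of this diff, and the helper name defer_long_softirqs() plus its placement in the softirq path are assumptions for illustration.

	/*
	 * Illustrative sketch only -- not taken from this patch.
	 * A softirq path could punt the long-running vectors in
	 * LONG_SOFTIRQ_MASK to ksoftirqd whenever the local cpu either
	 * already has runnable RT tasks or has been picked as the target
	 * of an imminent RT wakeup (incoming_rt_task set by
	 * select_task_rq_rt() above).
	 */
	static bool defer_long_softirqs(u32 pending)
	{
		return (pending & LONG_SOFTIRQ_MASK) &&
		       cpu_has_rt_task(smp_processor_id());
	}

In a consumer like this, a stale incoming_rt_task value at worst defers one round of softirqs or falls back to today's inline handling, which is why the added comments describe the flag as an unsynchronized heuristic rather than a guarantee.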