| author | Syed Rameez Mustafa <rameezmustafa@codeaurora.org> | 2016-11-09 17:22:23 -0800 |
|---|---|---|
| committer | Syed Rameez Mustafa <rameezmustafa@codeaurora.org> | 2016-11-16 17:57:35 -0800 |
| commit | 8b74c7eb5f0b8badc7d76122d6e359bd43115b58 (patch) | |
| tree | e75245ccdb25ff0808b096fdf920579deeaf0e9d /kernel | |
| parent | 61f26e3aa5eee4fd8947ecca1cc45b5634e146b8 (diff) | |
sched: Remove thread group iteration from colocation
Iterating over a leader task's thread group in order to add its members to a
colocation group involves a complex locking chain that ends up
causing a deadlock. The deadlock arises as follows when the same task
is referenced on three different CPUs:
```
-----                          -----                          -----
CPU 0                          CPU 1                          CPU 2
-----                          -----                          -----
                               add_task_to_group(p)
__schedule(prev = p)           write_lock(                    ttwu(p)
                                 related_thread_grp_lock)
                                                              lock(pi_lock)
idle_balance()                                                wait for
                                                                p->on_cpu
load_balance()                 unable to acquire
                                 p->pi_lock
send_notification()
wait for read_lock(
  related_thread_grp_lock)
unable to set p->on_cpu
```
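For illustration only (this is not part of the patch), the same cycle can be
reproduced in userspace with POSIX primitives standing in for the kernel
objects: a pthread rwlock for related_thread_grp_lock, a mutex for p->pi_lock,
and an atomic flag for p->on_cpu. Built with `cc -pthread`, the sketch below
deadlocks by construction and never prints its final message:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t grp_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int on_cpu = 1;

/* CPU 1: add_task_to_group(p) */
static void *cpu1(void *arg)
{
	pthread_rwlock_wrlock(&grp_lock);  /* write_lock(related_thread_grp_lock) */
	sleep(1);                          /* give CPU 2 time to take pi_lock */
	pthread_mutex_lock(&pi_lock);      /* blocks: CPU 2 holds pi_lock */
	pthread_mutex_unlock(&pi_lock);
	pthread_rwlock_unlock(&grp_lock);
	return NULL;
}

/* CPU 2: ttwu(p) */
static void *cpu2(void *arg)
{
	pthread_mutex_lock(&pi_lock);      /* lock(pi_lock) */
	while (atomic_load(&on_cpu))       /* spins: CPU 0 never clears on_cpu */
		;
	pthread_mutex_unlock(&pi_lock);
	return NULL;
}

/* CPU 0: __schedule(prev = p) -> idle_balance() -> send_notification() */
static void *cpu0(void *arg)
{
	sleep(2);                          /* give CPU 1 time to take the write lock */
	pthread_rwlock_rdlock(&grp_lock);  /* blocks behind CPU 1's writer */
	atomic_store(&on_cpu, 0);          /* "set p->on_cpu": never reached */
	pthread_rwlock_unlock(&grp_lock);
	return NULL;
}

int main(void)
{
	pthread_t t0, t1, t2;

	pthread_create(&t1, NULL, cpu1, NULL);
	pthread_create(&t2, NULL, cpu2, NULL);
	pthread_create(&t0, NULL, cpu0, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	puts("no deadlock");               /* never printed */
	return 0;
}
```

Each thread acquires its first resource and then waits on one held by the
next, closing exactly the cycle shown in the trace above.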
There are a couple of ways to resolve this deadlock in the kernel;
however, none of them is trivial. For the sake of simplicity, move
the responsibility of thread group iteration back to userspace. This
applies both when the leader task is added to a colocation group and
when it is removed. The kernel continues to automatically add
newly forked children of the colocated leader to the colocation
group.
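As a sketch of what that userspace responsibility could look like: the loop
below walks /proc/<pid>/task and applies a group id to each existing thread.
The writable sched_group_id file name is an assumption here (how
sched_set_group_id() is exposed varies by tree); adjust it for your kernel.

```c
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>

/* Write group_id to the assumed per-task procfs hook for one thread. */
static int set_group_id(const char *pid, const char *tid, unsigned int group_id)
{
	char path[128];
	FILE *f;

	/* NOTE: the sched_group_id file name is an assumption, not from the patch. */
	snprintf(path, sizeof(path), "/proc/%s/task/%s/sched_group_id", pid, tid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%u\n", group_id);
	return fclose(f);
}

int main(int argc, char **argv)
{
	char task_dir[64];
	struct dirent *de;
	DIR *d;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <group_id>\n", argv[0]);
		return 1;
	}

	/* Every thread of <pid>, including the leader, appears here. */
	snprintf(task_dir, sizeof(task_dir), "/proc/%s/task", argv[1]);
	d = opendir(task_dir);
	if (!d) {
		perror(task_dir);
		return 1;
	}

	while ((de = readdir(d)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		if (set_group_id(argv[1], de->d_name, (unsigned int)atoi(argv[2])))
			fprintf(stderr, "failed for tid %s\n", de->d_name);
	}
	closedir(d);
	return 0;
}
```

Newly forked children are still picked up in the kernel by
add_new_task_to_grp(), so userspace only needs to cover threads that already
exist when the group is configured.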
This still leaves an issue with the locking order of the pi_lock and
the related_thread_group_lock. To solve all deadlocks, we need to avoid
taking the pi_lock in reset_all_task_stats() and instead rely on a more
heavy-handed approach of taking all rq locks. The pi_lock was taken to
avoid a race between reset_all_task_stats() and sched_exit(). The race
can be avoided with rq locks as well.
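As a userspace analogue of the resulting discipline (again an illustration,
not the patch itself): once every path that needs both locks takes pi_lock
before the group lock, the cycle from the trace above cannot form. Reusing
the stand-in names from the earlier sketch:

```c
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t grp_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;

/* sched_set_group_id() after the patch: pi_lock first, then the group lock. */
static void *group_update(void *arg)
{
	pthread_mutex_lock(&pi_lock);
	pthread_rwlock_wrlock(&grp_lock);
	usleep(1000);                      /* do the group update */
	pthread_rwlock_unlock(&grp_lock);
	pthread_mutex_unlock(&pi_lock);
	return NULL;
}

/* ttwu(p): needs only pi_lock. */
static void *wakeup(void *arg)
{
	pthread_mutex_lock(&pi_lock);
	usleep(1000);
	pthread_mutex_unlock(&pi_lock);
	return NULL;
}

/* send_notification(): needs only the group lock, as a reader. */
static void *notification(void *arg)
{
	pthread_rwlock_rdlock(&grp_lock);
	usleep(1000);
	pthread_rwlock_unlock(&grp_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	pthread_create(&t[0], NULL, group_update, NULL);
	pthread_create(&t[1], NULL, wakeup, NULL);
	pthread_create(&t[2], NULL, notification, NULL);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	puts("no deadlock");               /* always reached */
	return 0;
}
```

This always completes, because no thread ever waits on pi_lock while holding
grp_lock.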
Change-Id: I15323e3ef91401142d3841db59c18fd8fee753fd
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/hmp.c | 126 |
1 file changed, 28 insertions, 98 deletions
```diff
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 30391aae0822..c1c35bb582fb 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -3110,37 +3110,9 @@ static void reset_all_task_stats(void)
 {
 	struct task_struct *g, *p;
 
-	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-		raw_spin_lock_irq(&p->pi_lock);
 		reset_task_stats(p);
-		raw_spin_unlock_irq(&p->pi_lock);
 	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-}
-
-static void disable_window_stats(void)
-{
-	unsigned long flags;
-	int i;
-
-	local_irq_save(flags);
-	for_each_possible_cpu(i)
-		raw_spin_lock(&cpu_rq(i)->lock);
-
-	sched_disable_window_stats = 1;
-
-	for_each_possible_cpu(i)
-		raw_spin_unlock(&cpu_rq(i)->lock);
-
-	local_irq_restore(flags);
-}
-
-/* Called with all cpu's rq->lock held */
-static void enable_window_stats(void)
-{
-	sched_disable_window_stats = 0;
 }
 
 enum reset_reason_code {
@@ -3166,17 +3138,22 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 	unsigned int old = 0, new = 0;
 	struct related_thread_group *grp;
 
-	read_lock(&related_thread_group_lock);
-
-	disable_window_stats();
+	local_irq_save(flags);
 
-	reset_all_task_stats();
+	read_lock(&tasklist_lock);
 
-	local_irq_save(flags);
+	read_lock(&related_thread_group_lock);
 
+	/* Taking all runqueue locks prevents race with sched_exit(). */
 	for_each_possible_cpu(cpu)
 		raw_spin_lock(&cpu_rq(cpu)->lock);
 
+	sched_disable_window_stats = 1;
+
+	reset_all_task_stats();
+
+	read_unlock(&tasklist_lock);
+
 	list_for_each_entry(grp, &related_thread_groups, list) {
 		int j;
 
@@ -3196,7 +3173,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 		sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
 	}
 
-	enable_window_stats();
+	sched_disable_window_stats = 0;
 
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
@@ -3239,10 +3216,10 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 	for_each_possible_cpu(cpu)
 		raw_spin_unlock(&cpu_rq(cpu)->lock);
 
-	local_irq_restore(flags);
-
 	read_unlock(&related_thread_group_lock);
 
+	local_irq_restore(flags);
+
 	trace_sched_reset_all_window_stats(window_start, window_size,
 		sched_ktime_clock() - start_ts, reason, old, new);
 }
@@ -3867,12 +3844,7 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
 
 void set_preferred_cluster(struct related_thread_group *grp)
 {
-	/*
-	 * Prevent possible deadlock with update_children(). Not updating
-	 * the preferred cluster once is not a big deal.
-	 */
-	if (!raw_spin_trylock(&grp->lock))
-		return;
+	raw_spin_lock(&grp->lock);
 	_set_preferred_cluster(grp);
 	raw_spin_unlock(&grp->lock);
 }
@@ -4116,64 +4088,19 @@ static void free_related_thread_group(struct rcu_head *rcu)
 	kfree(grp);
 }
 
-/*
- * The thread group for a task can change while we are here. However,
- * add_new_task_to_grp() will take care of any tasks that we miss here.
- * When a parent exits, and a child thread is simultaneously exiting,
- * sched_set_group_id() will synchronize those operations.
- */
-static void update_children(struct task_struct *leader,
-			    struct related_thread_group *grp, int event)
-{
-	struct task_struct *child;
-	struct rq *rq;
-	unsigned long flags;
-
-	if (!thread_group_leader(leader))
-		return;
-
-	if (event == ADD_TASK && !sysctl_sched_enable_thread_grouping)
-		return;
-
-	if (thread_group_empty(leader))
-		return;
-
-	child = next_thread(leader);
-
-	do {
-		rq = task_rq_lock(child, &flags);
-
-		if (event == REM_TASK && child->grp && grp == child->grp) {
-			transfer_busy_time(rq, grp, child, event);
-			list_del_init(&child->grp_list);
-			rcu_assign_pointer(child->grp, NULL);
-		} else if (event == ADD_TASK && !child->grp) {
-			transfer_busy_time(rq, grp, child, event);
-			list_add(&child->grp_list, &grp->tasks);
-			rcu_assign_pointer(child->grp, grp);
-		}
-
-		task_rq_unlock(rq, child, &flags);
-	} while_each_thread(leader, child);
-
-}
-
 static void remove_task_from_group(struct task_struct *p)
 {
 	struct related_thread_group *grp = p->grp;
 	struct rq *rq;
 	int empty_group = 1;
-	unsigned long flags;
 
 	raw_spin_lock(&grp->lock);
 
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	transfer_busy_time(rq, p->grp, p, REM_TASK);
 	list_del_init(&p->grp_list);
 	rcu_assign_pointer(p->grp, NULL);
-	task_rq_unlock(rq, p, &flags);
-
-	update_children(p, grp, REM_TASK);
+	__task_rq_unlock(rq);
 
 	if (!list_empty(&grp->tasks)) {
 		empty_group = 0;
@@ -4192,7 +4119,6 @@ static int
 add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
 {
 	struct rq *rq;
-	unsigned long flags;
 
 	raw_spin_lock(&grp->lock);
 
@@ -4200,13 +4126,11 @@ add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
 	 * Change p->grp under rq->lock. Will prevent races with read-side
 	 * reference of p->grp in various hot-paths
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	transfer_busy_time(rq, grp, p, ADD_TASK);
 	list_add(&p->grp_list, &grp->tasks);
 	rcu_assign_pointer(p->grp, grp);
-	task_rq_unlock(rq, p, &flags);
-
-	update_children(p, grp, ADD_TASK);
+	__task_rq_unlock(rq);
 
 	_set_preferred_cluster(grp);
 
@@ -4235,7 +4159,12 @@ void add_new_task_to_grp(struct task_struct *new)
 	grp = task_related_thread_group(parent);
 	rcu_read_unlock();
 
-	/* Its possible that update_children() already added us to the group */
+	/*
+	 * It's possible that someone already added the new task to the
+	 * group. A leader's thread group is updated prior to calling
+	 * this function. It's also possible that the leader has exited
+	 * the group. In either case, there is nothing else to do.
+	 */
 	if (!grp || new->grp) {
 		write_unlock_irqrestore(&related_thread_group_lock, flags);
 		return;
@@ -4256,8 +4185,8 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 	unsigned long flags;
 	struct related_thread_group *grp = NULL;
 
-	/* Prevents tasks from exiting while we are managing groups. */
-	write_lock_irqsave(&related_thread_group_lock, flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	write_lock(&related_thread_group_lock);
 
 	/* Switching from one group to another directly is not permitted */
 	if ((current != p && p->flags & PF_EXITING) ||
@@ -4284,7 +4213,8 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 	BUG_ON(!grp);
 	rc = add_task_to_group(p, grp);
 done:
-	write_unlock_irqrestore(&related_thread_group_lock, flags);
+	write_unlock(&related_thread_group_lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return rc;
 }
```
