From bd0c1ed954159c6457feb246b7682144d782829a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Aug 2021 21:46:14 +0800 Subject: futex: Rename free_pi_state() to put_pi_state() [ Upstream commit 29e9ee5d48c35d6cf8afe09bdf03f77125c9ac11 ] free_pi_state() is confusing as it is in fact only freeing/caching the pi state when the last reference is gone. Rename it to put_pi_state() which reflects better what it is doing. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Bhuvanesh_Surachari@mentor.com Cc: Andy Lowe Link: http://lkml.kernel.org/r/20151219200607.259636467@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index ff5499b0c5b3..dbb38e14f6fc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -859,9 +859,12 @@ static void pi_state_update_owner(struct futex_pi_state *pi_state, } /* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + * * Must be called with the hb lock held. */ -static void free_pi_state(struct futex_pi_state *pi_state) +static void put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) return; @@ -2121,7 +2124,7 @@ retry_private: case 0: break; case -EFAULT: - free_pi_state(pi_state); + put_pi_state(pi_state); pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -2139,7 +2142,7 @@ retry_private: * exit to complete. * - EAGAIN: The user space value changed. 
*/ - free_pi_state(pi_state); + put_pi_state(pi_state); pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -2214,7 +2217,7 @@ retry_private: } else if (ret) { /* -EDEADLK */ this->pi_state = NULL; - free_pi_state(pi_state); + put_pi_state(pi_state); goto out_unlock; } } @@ -2223,7 +2226,7 @@ retry_private: } out_unlock: - free_pi_state(pi_state); + put_pi_state(pi_state); double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); @@ -2376,7 +2379,7 @@ static void unqueue_me_pi(struct futex_q *q) __unqueue_futex(q); BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); + put_pi_state(q->pi_state); q->pi_state = NULL; spin_unlock(q->lock_ptr); @@ -3210,7 +3213,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ - free_pi_state(q.pi_state); + put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); /* * Adjust the return value. It's either -EFAULT or -- cgit v1.2.3 From 52937382542020d72bd8708dc10ac829d8b06ffe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Aug 2021 21:46:15 +0800 Subject: futex: Cleanup refcounting [ Upstream commit bf92cf3a5100f5a0d5f9834787b130159397cb22 ] Add a get_pi_state() as counterpart for put_pi_state() so the refcounting becomes consistent. 
Signed-off-by: Peter Zijlstra (Intel) Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170322104151.801778516@infradead.org Signed-off-by: Thomas Gleixner Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index dbb38e14f6fc..dab9c79a931a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -825,7 +825,7 @@ static int refill_pi_state_cache(void) return 0; } -static struct futex_pi_state * alloc_pi_state(void) +static struct futex_pi_state *alloc_pi_state(void) { struct futex_pi_state *pi_state = current->pi_state_cache; @@ -858,6 +858,11 @@ static void pi_state_update_owner(struct futex_pi_state *pi_state, } } +static void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +} + /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. @@ -901,7 +906,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Look up the task based on what TID userspace gave us. * We dont trust it. */ -static struct task_struct * futex_find_get_task(pid_t pid) +static struct task_struct *futex_find_get_task(pid_t pid) { struct task_struct *p; @@ -1149,7 +1154,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval, goto out_einval; out_attach: - atomic_inc(&pi_state->refcount); + get_pi_state(pi_state); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); *ps = pi_state; return 0; @@ -2204,7 +2209,7 @@ retry_private: */ if (requeue_pi) { /* Prepare the waiter to take the rt_mutex. 
*/ - atomic_inc(&pi_state->refcount); + get_pi_state(pi_state); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, -- cgit v1.2.3 From 50801cdc86003c4e20b9ae668cf2659d0218cfcc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Aug 2021 21:46:16 +0800 Subject: futex,rt_mutex: Introduce rt_mutex_init_waiter() [ Upstream commit 50809358dd7199aa7ce232f6877dd09ec30ef374 ] Since there's already two copies of this code, introduce a helper now before adding a third one. Signed-off-by: Peter Zijlstra (Intel) Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org Signed-off-by: Thomas Gleixner Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index dab9c79a931a..53a085a378f3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3156,10 +3156,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. 
*/ - debug_rt_mutex_init_waiter(&rt_waiter); - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); - RB_CLEAR_NODE(&rt_waiter.tree_entry); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) -- cgit v1.2.3 From 89cb69dd360a0a582dbe3c3bd75ddac1ba830a9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Aug 2021 21:46:17 +0800 Subject: futex: Pull rt_mutex_futex_unlock() out from under hb->lock [ Upstream commit 16ffa12d742534d4ff73e8b3a4e81c1de39196f0 ] There's a number of 'interesting' problems, all caused by holding hb->lock while doing the rt_mutex_unlock() equivalent. Notably: - a PI inversion on hb->lock; and, - a SCHED_DEADLINE crash because of pointer instability. The previous changes: - changed the locking rules to cover {uval,pi_state} with wait_lock. - allow to do rt_mutex_futex_unlock() without dropping wait_lock; which in turn allows to rely on wait_lock atomicity completely. - simplified the waiter conundrum. It's now sufficient to hold rtmutex::wait_lock and a reference on the pi_state to protect the state consistency, so hb->lock can be dropped before calling rt_mutex_futex_unlock(). 
Signed-off-by: Peter Zijlstra (Intel) Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org Signed-off-by: Thomas Gleixner Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 111 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 43 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index 53a085a378f3..dcea7b214e94 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -966,10 +966,12 @@ static void exit_pi_state_list(struct task_struct *curr) pi_state->owner = NULL; raw_spin_unlock_irq(&curr->pi_lock); - rt_mutex_futex_unlock(&pi_state->pi_mutex); - + get_pi_state(pi_state); spin_unlock(&hb->lock); + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); @@ -1083,6 +1085,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval, * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), * which in turn means that futex_lock_pi() still has a reference on * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. */ WARN_ON(!atomic_read(&pi_state->refcount)); @@ -1537,48 +1544,40 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) q->lock_ptr = NULL; } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, - struct futex_hash_bucket *hb) +/* + * Caller must hold a reference on @pi_state. 
+ */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + struct task_struct *new_owner; + bool deboost = false; WAKE_Q(wake_q); - bool deboost; int ret = 0; - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); - - /* - * When we interleave with futex_lock_pi() where it does - * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter, - * but the rt_mutex's wait_list can be empty (either still, or again, - * depending on which side we land). - * - * When this happens, give up our locks and try again, giving the - * futex_lock_pi() instance time to complete, either by waiting on the - * rtmutex or removing itself from the futex queue. - */ if (!new_owner) { - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return -EAGAIN; + /* + * Since we held neither hb->lock nor wait_lock when coming + * into this function, we could have raced with futex_lock_pi() + * such that we might observe @this futex_q waiter, but the + * rt_mutex's wait_list can be empty (either still, or again, + * depending on which side we land). + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; } /* - * We pass it to the next owner. The WAITERS bit is always - * kept enabled while there is PI state around. We cleanup the - * owner died bit, because we are the owner. + * We pass it to the next owner. 
The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); @@ -1611,15 +1610,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); } +out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); if (deboost) { wake_up_q(&wake_q); rt_mutex_adjust_prio(current); } - return 0; + return ret; } /* @@ -2462,7 +2461,7 @@ retry: if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; - while (1) { + for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) @@ -2975,10 +2974,36 @@ retry: */ match = futex_top_waiter(hb, &key); if (match) { - ret = wake_futex_pi(uaddr, uval, match, hb); + struct futex_pi_state *pi_state = match->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + /* - * In case of success wake_futex_pi dropped the hash - * bucket lock. + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + /* + * Grab a reference on the pi_state and drop hb->lock. + * + * The reference ensures pi_state lives, dropping the hb->lock + * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to + * close the races against futex_lock_pi(), but in case of + * _any_ fail we'll abort and retry the whole deal. + */ + get_pi_state(pi_state); + spin_unlock(&hb->lock); + + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. */ if (!ret) goto out_putkey; @@ -2993,7 +3018,6 @@ retry: * setting the FUTEX_WAITERS bit. Try again. 
*/ if (ret == -EAGAIN) { - spin_unlock(&hb->lock); put_futex_key(&key); goto retry; } @@ -3001,7 +3025,7 @@ retry: * wake_futex_pi has detected invalid state. Tell user * space. */ - goto out_unlock; + goto out_putkey; } /* @@ -3011,8 +3035,10 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + spin_unlock(&hb->lock); goto pi_faulted; + } /* * If uval has changed, let user space handle it. @@ -3026,7 +3052,6 @@ out_putkey: return ret; pi_faulted: - spin_unlock(&hb->lock); put_futex_key(&key); ret = fault_in_user_writeable(uaddr); -- cgit v1.2.3 From b5dac38eb0ff3cbef23afd36d6822291a2a757a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Aug 2021 21:46:18 +0800 Subject: futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock() [ Upstream commit cfafcd117da0216520568c195cb2f6cd1980c4bb ] By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list modifications are done under both hb->lock and wait_lock. This closes the obvious interleave pattern between futex_lock_pi() and futex_unlock_pi(), but not entirely so. 
See below: Before: futex_lock_pi() futex_unlock_pi() unlock hb->lock lock hb->lock unlock hb->lock lock rt_mutex->wait_lock unlock rt_mutex_wait_lock -EAGAIN lock rt_mutex->wait_lock list_add unlock rt_mutex->wait_lock schedule() lock rt_mutex->wait_lock list_del unlock rt_mutex->wait_lock -EAGAIN lock hb->lock After: futex_lock_pi() futex_unlock_pi() lock hb->lock lock rt_mutex->wait_lock list_add unlock rt_mutex->wait_lock unlock hb->lock schedule() lock hb->lock unlock hb->lock lock hb->lock lock rt_mutex->wait_lock list_del unlock rt_mutex->wait_lock lock rt_mutex->wait_lock unlock rt_mutex_wait_lock -EAGAIN unlock hb->lock It does however solve the earlier starvation/live-lock scenario which got introduced with the -EAGAIN since unlike the before scenario; where the -EAGAIN happens while futex_unlock_pi() doesn't hold any locks; in the after scenario it happens while futex_unlock_pi() actually holds a lock, and then it is serialized on that lock. Signed-off-by: Peter Zijlstra (Intel) Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org Signed-off-by: Thomas Gleixner Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 77 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 22 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index dcea7b214e94..45f00a2fb59c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2284,20 +2284,7 @@ queue_unlock(struct futex_hash_bucket *hb) hb_waiters_dec(hb); } -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. 
A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -2314,6 +2301,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). 
+ */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __queue_me(q, hb); spin_unlock(&hb->lock); } @@ -2819,6 +2824,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *exiting = NULL; + struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; int res, ret; @@ -2879,24 +2885,51 @@ retry_private: } } + WARN_ON(!q.pi_state); + /* * Only actually queue now that the atomic ops are done: */ - queue_me(&q, hb); + __queue_me(&q, hb); - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) { - ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); - } else { + if (trylock) { ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } + + /* + * We must add ourselves to the rt_mutex waitlist while holding hb->lock + * such that the hb and rt_mutex wait lists match. + */ + rt_mutex_init_waiter(&rt_waiter); + ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + if (ret) { + if (ret == 1) + ret = 0; + + goto no_block; } + spin_unlock(q.lock_ptr); + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + spin_lock(q.lock_ptr); + /* + * If we failed to acquire the lock (signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. 
-- cgit v1.2.3 From 1f705af703b314e969264de54f3bcdfffabf2cf5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Aug 2021 21:46:19 +0800 Subject: futex: Futex_unlock_pi() determinism [ Upstream commit bebe5b514345f09be2c15e414d076b02ecb9cce8 ] The problem with returning -EAGAIN when the waiter state mismatches is that it becomes very hard to prove a bounded execution time on the operation. And seeing that this is an RT operation, this is somewhat important. While in practice, given the previous patch, it will be very unlikely to ever really take more than one or two rounds, proving so becomes rather hard. However, now that modifying wait_list is done while holding both hb->lock and wait_lock, the scenario can be avoided entirely by acquiring wait_lock while still holding hb->lock. Doing a hand-over, without leaving a hole. Signed-off-by: Peter Zijlstra (Intel) Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170322104152.112378812@infradead.org Signed-off-by: Thomas Gleixner Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index 45f00a2fb59c..8f6372d3a1fe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1555,15 +1555,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ WAKE_Q(wake_q); int ret = 0; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); - if (!new_owner) { + if (WARN_ON_ONCE(!new_owner)) { /* - * Since we held neither hb->lock nor wait_lock when coming - * into this function, we could have raced with futex_lock_pi() - * such that we might observe @this futex_q waiter, but 
the - * rt_mutex's wait_list can be empty (either still, or again, - * depending on which side we land). + * As per the comment in futex_unlock_pi() this should not happen. * * When this happens, give up our locks and try again, giving * the futex_lock_pi() instance time to complete, either by @@ -3020,15 +3015,18 @@ retry: if (pi_state->owner != current) goto out_unlock; + get_pi_state(pi_state); /* - * Grab a reference on the pi_state and drop hb->lock. + * Since modifying the wait_list is done while holding both + * hb->lock and wait_lock, holding either is sufficient to + * observe it. * - * The reference ensures pi_state lives, dropping the hb->lock - * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to - * close the races against futex_lock_pi(), but in case of - * _any_ fail we'll abort and retry the whole deal. + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. */ - get_pi_state(pi_state); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); ret = wake_futex_pi(uaddr, uval, pi_state); -- cgit v1.2.3 From 6255b40352498beb0309c99367542302711231e4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 2 Aug 2021 21:46:21 +0800 Subject: futex: Handle transient "ownerless" rtmutex state correctly [ Upstream commit 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 ] Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner(). This is one possible chain of events leading to this: Task Prio Operation T1 120 lock(F) T2 120 lock(F) -> blocks (top waiter) T3 50 (RT) lock(F) -> boosts T1 and blocks (new top waiter) XX timeout/ -> wakes T2 signal T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set) T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter and the lower priority T2 cannot steal the lock. 
-> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON() The comment states that this is invalid and rt_mutex_real_owner() must return a non NULL owner when the trylock failed, but in case of a queued and woken up waiter rt_mutex_real_owner() == NULL is a valid transient state. The higher priority waiter has simply not yet managed to take over the rtmutex. The BUG_ON() is therefore wrong and this is just another retry condition in fixup_pi_state_owner(). Drop the locks, so that T3 can make progress, and then try the fixup again. Gratian provided a great analysis, traces and a reproducer. The analysis is to the point, but it confused the hell out of that tglx dude who had to page in all the futex horrors again. Condensed version is above. [ tglx: Wrote comment and changelog ] Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Reported-by: Gratian Crisan Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index 8f6372d3a1fe..e7c2e552aef4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2437,10 +2437,22 @@ retry: } /* - * Since we just failed the trylock; there must be an owner. + * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. */ newowner = rt_mutex_owner(&pi_state->pi_mutex); - BUG_ON(!newowner); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. 
It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_fault; + } } else { WARN_ON_ONCE(argowner != current); if (oldowner == current) { -- cgit v1.2.3 From 9b8d748d4bb027f9c2aa7ce03801d270dc0819e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Aug 2021 21:46:22 +0800 Subject: futex: Avoid freeing an active timer [ Upstream commit 97181f9bd57405b879403763284537e27d46963d ] Alexander reported a hrtimer debug_object splat: ODEBUG: free active (active state 0) object type: hrtimer hint: hrtimer_wakeup (kernel/time/hrtimer.c:1423) debug_object_free (lib/debugobjects.c:603) destroy_hrtimer_on_stack (kernel/time/hrtimer.c:427) futex_lock_pi (kernel/futex.c:2740) do_futex (kernel/futex.c:3399) SyS_futex (kernel/futex.c:3447 kernel/futex.c:3415) do_syscall_64 (arch/x86/entry/common.c:284) entry_SYSCALL64_slow_path (arch/x86/entry/entry_64.S:249) Which was caused by commit: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") ... losing the hrtimer_cancel() in the shuffle. Where previously the hrtimer_cancel() was done by rt_mutex_slowlock() we now need to do it manually. 
Reported-by: Alexander Levin Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704101802370.2906@nanos Signed-off-by: Ingo Molnar Signed-off-by: Zhen Lei Acked-by: Joe Korty Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/futex.c') diff --git a/kernel/futex.c b/kernel/futex.c index e7c2e552aef4..6d47b7dc1cfb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2960,8 +2960,10 @@ out_unlock_put_key: out_put_key: put_futex_key(&q.key); out: - if (to) + if (to) { + hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); + } return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: -- cgit v1.2.3