Diffstat (limited to 'kernel/futex.c')
-rw-r--r--	kernel/futex.c	941
1 file changed, 775 insertions(+), 166 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 1c7619e6edd0..5990a8f95ae0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
@@ -171,8 +172,10 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int __read_mostly futex_cmpxchg_enabled;
#endif
/*
@@ -328,6 +331,12 @@ static inline bool should_fail_futex(bool fshared)
}
#endif /* CONFIG_FAIL_FUTEX */
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
static inline void futex_get_mm(union futex_key *key)
{
atomic_inc(&key->private.mm->mm_count);
@@ -378,9 +387,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
- u32 hash = jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
+
return &futex_queues[hash & (futex_hashsize - 1)];
}
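The reworked hash feeds every key word up to (but not including) both.offset into jhash2(), seeded with the offset itself, instead of hard-coding the word/ptr pair. A minimal user-space sketch of the same bucket-lookup shape; the mixer function, key layout and table size below are illustrative stand-ins, not the kernel's jhash2() or futex_hashsize:

#include <stddef.h>
#include <stdint.h>

#define FUTEX_HASHSIZE 256			/* stand-in; must be a power of two */

struct futex_key {
	uint64_t word;				/* address/inode word */
	void *ptr;				/* mm or inode pointer */
	uint32_t offset;			/* offset within the page */
};

struct futex_hash_bucket { int chain; };	/* placeholder bucket */
static struct futex_hash_bucket futex_queues[FUTEX_HASHSIZE];

/* Illustrative mixer; the kernel uses jhash2() here. */
static uint32_t hash_words(const uint32_t *k, size_t n, uint32_t seed)
{
	uint32_t h = seed;

	while (n--)
		h = (h ^ *k++) * 0x9e3779b9u;
	return h;
}

static struct futex_hash_bucket *hash_futex(const struct futex_key *key)
{
	/* Hash every key word up to ->offset, seeded with the offset. */
	uint32_t hash = hash_words((const uint32_t *)key,
				   offsetof(struct futex_key, offset) / 4,
				   key->offset);

	return &futex_queues[hash & (FUTEX_HASHSIZE - 1)];
}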
@@ -816,7 +825,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -849,10 +858,18 @@ static void pi_state_update_owner(struct futex_pi_state *pi_state,
}
}
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
@@ -865,7 +882,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
pi_state_update_owner(pi_state, NULL);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
rt_mutex_proxy_unlock(&pi_state->pi_mutex);
}
@@ -887,7 +906,7 @@ static void free_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We don't trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -906,7 +925,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
@@ -947,10 +966,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -1005,6 +1026,39 @@ void exit_pi_state_list(struct task_struct *curr)
* [10] There is no transient state which leaves owner and user space
* TID out of sync. Except one error case where the kernel is denied
* write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
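The comment above pins the nesting order hb->lock -> pi_mutex->wait_lock -> p->pi_lock. A minimal pthreads sketch of honoring such a documented order; all three mutexes are illustrative stand-ins for the kernel locks:

#include <pthread.h>

/* Stand-ins for hb->lock, pi_mutex->wait_lock and p->pi_lock. */
static pthread_mutex_t hb_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pi_lock   = PTHREAD_MUTEX_INITIALIZER;

static void update_all_state(void)
{
	/* Acquire strictly in the documented order... */
	pthread_mutex_lock(&hb_lock);
	pthread_mutex_lock(&wait_lock);
	pthread_mutex_lock(&pi_lock);

	/* ...touch {uval, pi_state} and the list relations here... */

	/* ...and release in the reverse order. */
	pthread_mutex_unlock(&pi_lock);
	pthread_mutex_unlock(&wait_lock);
	pthread_mutex_unlock(&hb_lock);
}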
/*
@@ -1012,10 +1066,12 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -1023,9 +1079,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1040,11 +1126,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1056,14 +1142,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1072,19 +1158,124 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
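The handshake works because the exiting task holds futex_exit_mutex across its entire futex cleanup, so a waiter that merely acquires and releases the mutex is serialized against that cleanup. A user-space sketch of the barrier idiom, assuming a plain pthread mutex in place of futex_exit_mutex:

#include <pthread.h>

/* Stand-in for tsk->futex_exit_mutex. */
static pthread_mutex_t futex_exit_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Exiting side: hold the mutex across the entire futex cleanup. */
static void exiting_side_cleanup(void)
{
	pthread_mutex_lock(&futex_exit_mutex);
	/* ...walk the robust list, clean up PI state... */
	pthread_mutex_unlock(&futex_exit_mutex);
}

/* Waiter side: lock/unlock is a pure barrier against that cleanup. */
static void waiter_side_wait(void)
{
	pthread_mutex_lock(&futex_exit_mutex);
	/*
	 * No state check needed: getting here means the cleanup has
	 * finished (or had not started), and the caller retries anyway.
	 */
	pthread_mutex_unlock(&futex_exit_mutex);
}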
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
}
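The decisive step in handle_exit_race() is rereading the user space value under the lock and comparing it with the value the decision was based on; only an unchanged value justifies the hard -ESRCH. A condensed user-space sketch of that pattern, with an atomic load standing in for get_futex_value_locked() (the kernel helper can additionally fail with -EFAULT, which has no analogue here):

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

/* Decide between "retry" and "really gone" for an absent lock owner. */
static int check_exit_race(_Atomic uint32_t *uaddr, uint32_t uval)
{
	/* Stand-in for get_futex_value_locked(). */
	uint32_t uval2 = atomic_load(uaddr);

	if (uval2 != uval)
		return -EAGAIN;		/* value changed under us: retry */

	return -ESRCH;			/* unchanged, owner gone: give up */
}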
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
*/
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
- struct futex_pi_state **ps)
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
pid_t pid = uval & FUTEX_TID_MASK;
struct futex_pi_state *pi_state;
@@ -1093,12 +1284,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
*/
if (!pid)
- return -ESRCH;
+ return -EAGAIN;
p = futex_find_get_task(pid);
if (!p)
- return -ESRCH;
+ return handle_exit_race(uaddr, uval, NULL);
if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
@@ -1106,27 +1300,41 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
}
/*
- * We need to look at the task state flags to figure out,
- * whether the task is exiting. To protect against the do_exit
- * change of the task flags, we do this protected by
- * p->pi_lock:
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->flags & PF_EXITING)) {
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
/*
- * The task is on the way out. When PF_EXITPIDONE is
- * set, we know that the task has finished the
- * cleanup:
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
*/
- int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+ int ret = handle_exit_race(uaddr, uval, p);
raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
return ret;
}
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1151,8 +1359,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
struct futex_q *match = futex_top_waiter(hb, key);
@@ -1161,13 +1371,13 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* attach to the pi_state when the validation succeeds.
*/
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
}
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1180,7 +1390,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
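lock_pi_update_atomic() is a plain compare-and-exchange step: update the word only if it still holds the expected value, otherwise hand back -EAGAIN so the caller rereads and retries. A user-space equivalent with C11 atomics, again minus the -EFAULT path, which only exists for user memory accessed from the kernel:

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

/* Replace uval with newval iff the word still holds uval. */
static int lock_update_atomic(_Atomic uint32_t *uaddr,
			      uint32_t uval, uint32_t newval)
{
	uint32_t expected = uval;

	if (atomic_compare_exchange_strong(uaddr, &expected, newval))
		return 0;

	/* If the user space value changed, let the caller retry. */
	return -EAGAIN;
}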
@@ -1193,6 +1403,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* lookup
* @task: the task to perform the atomic lock work for. This will
* be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Return:
@@ -1201,11 +1413,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
*/
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
struct futex_pi_state **ps,
- struct task_struct *task, int set_waiters)
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
struct futex_q *match;
@@ -1236,7 +1454,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
*/
match = futex_top_waiter(hb, key);
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1275,7 +1493,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
/**
@@ -1326,48 +1544,35 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
q->lock_ptr = NULL;
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool deboost = false;
WAKE_Q(wake_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
- /*
- * When we interleave with futex_lock_pi() where it does
- * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
- * but the rt_mutex's wait_list can be empty (either still, or again,
- * depending on which side we land).
- *
- * When this happens, give up our locks and try again, giving the
- * futex_lock_pi() instance time to complete, either by waiting on the
- * rtmutex or removing itself from the futex queue.
- */
- if (!new_owner) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return -EAGAIN;
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
}
/*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1376,6 +1581,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If an unconditional UNLOCK_PI operation (user space did not
@@ -1399,15 +1605,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
}
+out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
if (deboost) {
wake_up_q(&wake_q);
rt_mutex_adjust_prio(current);
}
- return 0;
+ return ret;
}
/*
@@ -1495,8 +1701,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
int oldval, ret;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
- if (oparg < 0 || oparg > 31)
- return -EINVAL;
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
oparg = 1 << oparg;
}
@@ -1688,6 +1902,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* @key1: the from futex key
* @key2: the to futex key
* @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1695,16 +1911,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
* Return:
* 0 - failed to acquire the lock atomically;
* >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
- struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2,
- union futex_key *key1, union futex_key *key2,
- struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
{
struct futex_q *top_waiter = NULL;
u32 curval;
@@ -1741,7 +1961,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
*/
vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- set_waiters);
+ exiting, set_waiters);
if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
return vpid;
@@ -1861,6 +2081,8 @@ retry_private:
}
if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ struct task_struct *exiting = NULL;
+
/*
* Attempt to acquire uaddr2 and wake the top waiter. If we
* intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -1868,7 +2090,8 @@ retry_private:
* faults rather in the requeue loop below.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state, nr_requeue);
+ &key2, &pi_state,
+ &exiting, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or is
@@ -1892,14 +2115,15 @@ retry_private:
* rereading and handing potential crap to
* lookup_pi_state.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+ &pi_state, &exiting);
}
switch (ret) {
case 0:
break;
case -EFAULT:
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -1909,19 +2133,26 @@ retry_private:
if (!ret)
goto retry;
goto out;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Owner is exiting and we just wait for the
+ * - EBUSY: Owner is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -1972,7 +2203,7 @@ retry_private:
*/
if (requeue_pi) {
/* Prepare the waiter to take the rt_mutex. */
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -1985,7 +2216,7 @@ retry_private:
} else if (ret) {
/* -EDEADLK */
this->pi_state = NULL;
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
goto out_unlock;
}
}
@@ -1994,7 +2225,7 @@ retry_private:
}
out_unlock:
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
@@ -2048,20 +2279,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2078,6 +2296,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2147,7 +2383,7 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q);
BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
@@ -2162,11 +2398,6 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
int err = 0;
oldowner = pi_state->owner;
-
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
-
/*
* We are here because either:
*
@@ -2185,11 +2416,10 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (!argowner) {
@@ -2202,15 +2432,27 @@ retry:
}
if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
- /* We got the lock after all, nothing to fix. */
+ /* We got the lock. pi_state is correct. Tell caller */
return 1;
}
/*
- * Since we just failed the trylock; there must be an owner.
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_fault;
+ }
} else {
WARN_ON_ONCE(argowner != current);
if (oldowner == current) {
@@ -2224,11 +2466,14 @@ retry:
}
newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2244,24 +2489,29 @@ retry:
*/
pi_state_update_owner(pi_state, newowner);
- return 0;
+ return argowner == current;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
err = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
@@ -2327,8 +2577,6 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- int ret = 0;
-
if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
@@ -2339,8 +2587,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current);
- goto out;
+ return fixup_pi_state_owner(uaddr, q, current);
+ return 1;
}
/*
@@ -2351,10 +2599,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Another speculative read; pi_state->owner == current is unstable
* but needs our attention.
*/
- if (q->pi_state->owner == current) {
- ret = fixup_pi_state_owner(uaddr, q, NULL);
- goto out;
- }
+ if (q->pi_state->owner == current)
+ return fixup_pi_state_owner(uaddr, q, NULL);
/*
* Paranoia check. If we did not take the lock, then we should not be
@@ -2363,8 +2609,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
return fixup_pi_state_owner(uaddr, q, current);
-out:
- return ret ? ret : locked;
+ return 0;
}
/**
@@ -2585,6 +2830,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2608,7 +2855,8 @@ retry:
retry_private:
hb = queue_lock(&q);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
if (unlikely(ret)) {
/*
* Atomic work succeeded and we got the lock,
@@ -2621,15 +2869,22 @@ retry_private:
goto out_unlock_put_key;
case -EFAULT:
goto uaddr_faulted;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Task is exiting and we just wait for the
+ * - EBUSY: Task is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -2637,25 +2892,52 @@ retry_private:
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
+ if (trylock) {
ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ /*
+ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+ * such that the hb and rt_mutex wait lists match.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ goto no_block;
+ }
+
+ spin_unlock(q.lock_ptr);
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
@@ -2678,8 +2960,10 @@ out_unlock_put_key:
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
@@ -2732,10 +3016,39 @@ retry:
*/
match = futex_top_waiter(hb, &key);
if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ struct futex_pi_state *pi_state = match->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * Since modifying the wait_list is done while holding both
+ * hb->lock and wait_lock, holding either is sufficient to
+ * observe it.
+ *
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2750,7 +3063,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2758,7 +3070,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2768,8 +3080,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2783,7 +3097,6 @@ out_putkey:
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2913,10 +3226,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2975,8 +3285,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
*/
- free_pi_state(q.pi_state);
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
+ /*
+ * Adjust the return value. It's either -EFAULT or
+ * success (1) but the caller expects 0 for success.
+ */
+ ret = ret < 0 ? ret : 0;
}
} else {
struct rt_mutex *pi_mutex;
@@ -3120,7 +3435,7 @@ err_unlock:
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
@@ -3195,7 +3510,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
*
* We silently return on any sign of list-walking problem.
*/
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
@@ -3258,6 +3573,114 @@ void exit_robust_list(struct task_struct *curr)
curr, pip);
}
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task locklessly. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent further damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection as is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about to
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
@@ -3350,6 +3773,192 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
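compat_fetch_robust_entry() decodes the robust-list convention that bit 0 of each entry pointer flags a PI futex and must be masked off before the pointer is used. A self-contained sketch of that tagged-pointer encode/decode pair, with plain uintptr_t standing in for compat_uptr_t:

#include <stdint.h>

/* Bit 0 of a robust-list entry pointer flags a PI futex. */
static uintptr_t encode_robust_entry(void *entry, unsigned int pi)
{
	return (uintptr_t)entry | (pi & 1);
}

static void *decode_robust_entry(uintptr_t uentry, unsigned int *pi)
{
	*pi = (unsigned int)(uentry & 1);
	return (void *)(uentry & ~(uintptr_t)1);
}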
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
+ int val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (compat_get_timespec(&ts, utime))
+ return -EFAULT;
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT */
+
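Note the asymmetry in the compat timeout handling above: FUTEX_WAIT passes a relative timeout, which is converted to an absolute deadline by adding the current time, while the *_BITSET and PI variants already expect absolute time. A user-space sketch of the same relative-to-absolute step; CLOCK_MONOTONIC is an assumption here, and in the kernel the conversion is the t = ktime_add_safe(ktime_get(), t) line:

#include <time.h>

/*
 * Turn a relative timeout into an absolute CLOCK_MONOTONIC deadline,
 * mirroring the ktime_add_safe() step used for FUTEX_WAIT.
 */
static struct timespec rel_to_abs(const struct timespec *rel)
{
	struct timespec now, abs;

	clock_gettime(CLOCK_MONOTONIC, &now);
	abs.tv_sec = now.tv_sec + rel->tv_sec;
	abs.tv_nsec = now.tv_nsec + rel->tv_nsec;
	if (abs.tv_nsec >= 1000000000L) {
		abs.tv_sec++;
		abs.tv_nsec -= 1000000000L;
	}
	return abs;
}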
static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG