Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit dc3f2ff1 authored by Peter Zijlstra's avatar Peter Zijlstra Committed by Greg Kroah-Hartman
Browse files

futex: Change locking rules



Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes a rt_mutex itself.

The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.

Rework and document the locking so rt_mutex::wait_lock is held accross all
operations which modify the user space value and the pi state.

This allows to invoke rt_mutex_unlock() (including deboost) without holding
hb->lock as a next step.

Nothing yet relies on the new locking rules.

Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org


Signed-off-by: default avatarThomas Gleixner <tglx@linutronix.de>
[Lee: Back-ported in support of a previous futex back-port attempt]
Signed-off-by: default avatarLee Jones <lee.jones@linaro.org>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent ad8fdbab
Loading
Loading
Loading
Loading
+112 −26
Original line number Diff line number Diff line
@@ -1019,6 +1019,39 @@ static void exit_pi_state_list(struct task_struct *curr)
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
@@ -1026,10 +1059,12 @@ static void exit_pi_state_list(struct task_struct *curr)
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	int ret, uval2;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
@@ -1037,8 +1072,33 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 */
	WARN_ON(!atomic_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (get_futex_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
@@ -1054,11 +1114,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				return -EINVAL;
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_state;
			goto out_attach;
		}

		/*
@@ -1070,14 +1130,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_state;
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			return -EINVAL;
			goto out_einval;
	}

	/*
@@ -1086,11 +1146,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		return -EINVAL;
out_state:
		goto out_einval;

out_attach:
	atomic_inc(&pi_state->refcount);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
@@ -1183,6 +1261,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,

	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	pi_state = alloc_pi_state();

@@ -1207,7 +1288,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
	return 0;
}

static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
static int lookup_pi_state(u32 __user *uaddr, u32 uval,
			   struct futex_hash_bucket *hb,
			   union futex_key *key, struct futex_pi_state **ps,
			   struct task_struct **exiting)
{
@@ -1218,7 +1300,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
	 * attach to the pi_state when the validation succeeds.
	 */
	if (match)
		return attach_to_pi_state(uval, match->pi_state, ps);
		return attach_to_pi_state(uaddr, uval, match->pi_state, ps);

	/*
	 * We are the first waiter - try to look up the owner based on
@@ -1301,7 +1383,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
	 */
	match = futex_top_waiter(hb, key);
	if (match)
		return attach_to_pi_state(uval, match->pi_state, ps);
		return attach_to_pi_state(uaddr, uval, match->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
@@ -1441,6 +1523,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,

	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
		ret = -EFAULT;

	} else if (curval != uval) {
		/*
		 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1977,7 +2060,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
			 * If that call succeeds then we have pi_state and an
			 * initial refcount on it.
			 */
			ret = lookup_pi_state(ret, hb2, &key2,
			ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
					      &pi_state, &exiting);
		}

@@ -2282,7 +2365,6 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
	int err = 0;

	oldowner = pi_state->owner;

	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;
@@ -2305,11 +2387,10 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would
	 * leave the pi_state in an inconsistent state when we fault
	 * here, because we need to drop the hash bucket lock to
	 * handle the fault. This might be observed in the PID check
	 * in lookup_pi_state.
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID check in lookup_pi_state.
	 */
retry:
	if (!argowner) {
@@ -2367,21 +2448,26 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
	return argowner == current;

	/*
	 * To handle the page fault we need to drop the hash bucket
	 * lock here. That gives the other task (either the highest priority
	 * waiter itself or the task which stole the rtmutex) the
	 * chance to try the fixup of the pi_state. So once we are
	 * back from handling the fault we need to check the pi_state
	 * after reacquiring the hash bucket lock and before trying to
	 * do another fixup. When the fixup has been done already we
	 * simply return.
	 * To handle the page fault we need to drop the locks here. That gives
	 * the other task (either the highest priority waiter itself or the
	 * task which stole the rtmutex) the chance to try the fixup of the
	 * pi_state. So once we are back from handling the fault we need to
	 * check the pi_state after reacquiring the locks and before trying to
	 * do another fixup. When the fixup has been done already we simply
	 * return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_fault:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	err = fault_in_user_writeable(uaddr);

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us: