From: Darren Hart on
This version pulls in the bits mistakenly left in patch 3/4.


From 9f8b4faac79518f98131464c2d21a1c64fb841d2 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc(a)us.ibm.com>
Date: Fri, 9 Jul 2010 16:44:47 -0400
Subject: [PATCH 4/4 V2] futex: convert hash_bucket locks to raw_spinlock_t

The requeue_pi mechanism introduced proxy locking of the rtmutex. This creates
a scenario where a task can wake up not knowing it has been enqueued on an
rtmutex. In order to detect this, the task would have to be able to take
task->pi_blocked_on->lock->wait_lock and/or the hb->lock. Unfortunately,
without already holding one of these, the pi_blocked_on variable can change
from NULL to valid or from valid to NULL. Therefore, the task cannot be allowed
to take a sleeping lock after wakeup, or it could end up trying to block on two
locks, with the second overwriting a valid pi_blocked_on value. This obviously
breaks the PI mechanism.
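
For illustration, the hazard boils down to the following (a simplified
sketch with made-up types, not the kernel structures verbatim):

struct rt_mutex_waiter;

struct task_sketch {
	/* At most one waiter can be recorded at any time. */
	struct rt_mutex_waiter *pi_blocked_on;
};

/* Every rtmutex-based (i.e. sleeping, on -rt) lock acquisition does this: */
static void block_on(struct task_sketch *t, struct rt_mutex_waiter *w)
{
	t->pi_blocked_on = w;
}

/*
 * 1) futex_requeue() proxy-blocks the task on the target rtmutex:
 *        block_on(task, &rtmutex_waiter);
 * 2) The task wakes and, unaware of 1), blocks on the sleeping hb->lock
 *    to find out why it woke:
 *        block_on(task, &hb_lock_waiter);
 * The second call silently overwrites a valid pi_blocked_on, and the PI
 * chain walk no longer sees the rtmutex the task is actually blocked on.
 */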

This patch increases latency. While running the ltp pthread_cond_many test
with which Michal reported the bug, I see double-digit microsecond hrtimer
latencies (typically only on the first run after boot):

kernel: hrtimer: interrupt took 75911 ns

This might be addressed by changing the various loops in the futex code to be
incremental, probably at some cost to throughput. The private hash_bucket
lists discussed in the past could reduce hb->lock contention in some scenarios.
It should be noted that pthread_cond_many is a rather pathological case.

This also introduces problems for plists, which want a spinlock_t rather
than a raw_spinlock_t. Any thoughts on how to address this?
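
One possibility might be the plist debug support added with the raw
spinlock split; a sketch, assuming the plist.h of this era, which carries
a raw_spinlock_t debug pointer alongside the spinlock_t one:

struct plist_head {
	struct list_head prio_list;
	struct list_head node_list;
#ifdef CONFIG_DEBUG_PI_LIST
	raw_spinlock_t *rawlock;
	spinlock_t *spinlock;
#endif
};

/*
 * If plist_head_init_raw() is available, futex_init() could pass the
 * raw lock through it instead of NULL:
 */
plist_head_init_raw(&futex_queues[i].chain, &futex_queues[i].lock);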

Signed-off-by: Darren Hart <dvhltc(a)us.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)elte.hu>
Cc: Eric Dumazet <eric.dumazet(a)gmail.com>
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Mike Galbraith <efault(a)gmx.de>
---
kernel/futex.c | 73 ++++++++++++++++++++++++++++++--------------------------
1 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 2cd58a2..0ad5a85 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -114,7 +114,7 @@ struct futex_q {
struct plist_node list;

struct task_struct *task;
- spinlock_t *lock_ptr;
+ raw_spinlock_t *lock_ptr;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
@@ -128,7 +128,7 @@ struct futex_q {
* waiting on a futex.
*/
struct futex_hash_bucket {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct plist_head chain;
};

@@ -479,7 +479,7 @@ void exit_pi_state_list(struct task_struct *curr)
hb = hash_futex(&key);
raw_spin_unlock_irq(&curr->pi_lock);

- spin_lock(&hb->lock);
+ raw_spin_lock(&hb->lock);

raw_spin_lock_irq(&curr->pi_lock);
/*
@@ -487,7 +487,7 @@ void exit_pi_state_list(struct task_struct *curr)
* task still owns the PI-state:
*/
if (head->next != next) {
- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
continue;
}

@@ -499,7 +499,7 @@ void exit_pi_state_list(struct task_struct *curr)

rt_mutex_unlock(&pi_state->pi_mutex);

- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);

raw_spin_lock_irq(&curr->pi_lock);
}
@@ -860,21 +860,21 @@ static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
if (hb1 <= hb2) {
- spin_lock(&hb1->lock);
+ raw_spin_lock(&hb1->lock);
if (hb1 < hb2)
- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
} else { /* hb1 > hb2 */
- spin_lock(&hb2->lock);
- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&hb2->lock);
+ raw_spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
}
}

static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
- spin_unlock(&hb1->lock);
+ raw_spin_unlock(&hb1->lock);
if (hb1 != hb2)
- spin_unlock(&hb2->lock);
+ raw_spin_unlock(&hb2->lock);
}

/*
@@ -896,7 +896,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
goto out;

hb = hash_futex(&key);
- spin_lock(&hb->lock);
+ raw_spin_lock(&hb->lock);
head = &hb->chain;

plist_for_each_entry_safe(this, next, head, list) {
@@ -916,7 +916,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
}
}

- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
put_futex_key(fshared, &key);
out:
return ret;
@@ -1070,6 +1070,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,

q->lock_ptr = &hb->lock;
#ifdef CONFIG_DEBUG_PI_LIST
+ /* FIXME: we're converting this to a raw lock... */
q->list.plist.spinlock = &hb->lock;
#endif

@@ -1377,14 +1378,14 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;

- spin_lock(&hb->lock);
+ raw_spin_lock(&hb->lock);
return hb;
}

static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
drop_futex_key_refs(&q->key);
}

@@ -1416,11 +1417,12 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)

plist_node_init(&q->list, prio);
#ifdef CONFIG_DEBUG_PI_LIST
+ /* FIXME: we're converting this to a raw_spinlock */
q->list.plist.spinlock = &hb->lock;
#endif
plist_add(&q->list, &hb->chain);
q->task = current;
- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
}

/**
@@ -1436,7 +1438,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
*/
static int unqueue_me(struct futex_q *q)
{
- spinlock_t *lock_ptr;
+ raw_spinlock_t *lock_ptr;
int ret = 0;

/* In the common case we don't take the spinlock, which is nice. */
@@ -1444,7 +1446,7 @@ retry:
lock_ptr = q->lock_ptr;
barrier();
if (lock_ptr != NULL) {
- spin_lock(lock_ptr);
+ raw_spin_lock(lock_ptr);
/*
* q->lock_ptr can change between reading it and
* spin_lock(), causing us to take the wrong lock. This
@@ -1459,7 +1461,7 @@ retry:
* we can detect whether we acquired the correct lock.
*/
if (unlikely(lock_ptr != q->lock_ptr)) {
- spin_unlock(lock_ptr);
+ raw_spin_unlock(lock_ptr);
goto retry;
}
WARN_ON(plist_node_empty(&q->list));
@@ -1467,7 +1469,7 @@ retry:

BUG_ON(q->pi_state);

- spin_unlock(lock_ptr);
+ raw_spin_unlock(lock_ptr);
ret = 1;
}

@@ -1491,7 +1493,7 @@ static void unqueue_me_pi(struct futex_q *q)
pi_state = q->pi_state;
q->pi_state = NULL;

- spin_unlock(q->lock_ptr);
+ raw_spin_unlock(q->lock_ptr);
drop_futex_key_refs(&q->key);

free_pi_state(pi_state);
@@ -1579,11 +1581,11 @@ retry:
* simply return.
*/
handle_fault:
- spin_unlock(q->lock_ptr);
+ raw_spin_unlock(q->lock_ptr);

ret = fault_in_user_writeable(uaddr);

- spin_lock(q->lock_ptr);
+ raw_spin_lock(q->lock_ptr);

/*
* Check if someone else fixed it for us:
@@ -1976,7 +1978,7 @@ retry_private:
ret = ret ? 0 : -EWOULDBLOCK;
}

- spin_lock(q.lock_ptr);
+ raw_spin_lock(q.lock_ptr);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2053,7 +2055,7 @@ retry:
goto out;

hb = hash_futex(&key);
- spin_lock(&hb->lock);
+ raw_spin_lock(&hb->lock);

/*
* To avoid races, try to do the TID -> 0 atomic transition
@@ -2102,14 +2104,14 @@ retry:
}

out_unlock:
- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
put_futex_key(fshared, &key);

out:
return ret;

pi_faulted:
- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
put_futex_key(fshared, &key);

ret = fault_in_user_writeable(uaddr);
@@ -2257,9 +2259,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);

- spin_lock(&hb->lock);
+ raw_spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
+ raw_spin_unlock(&hb->lock);
if (ret)
goto out_put_keys;

@@ -2277,10 +2279,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* did a lock-steal - fix up the PI-state in that case.
*/
if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
+ raw_spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current,
fshared);
- spin_unlock(q.lock_ptr);
+ raw_spin_unlock(q.lock_ptr);
}
} else {
/*
@@ -2293,7 +2295,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);

- spin_lock(q.lock_ptr);
+ raw_spin_lock(q.lock_ptr);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2668,8 +2670,11 @@ static int __init futex_init(void)
futex_cmpxchg_enabled = 1;

for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
- spin_lock_init(&futex_queues[i].lock);
+ /*
+ * FIXME: plist wants a spinlock, but the hb->lock is a raw_spinlock_t
+ */
+ plist_head_init(&futex_queues[i].chain, NULL /*&futex_queues[i].lock*/);
+ raw_spin_lock_init(&futex_queues[i].lock);
}

return 0;
--
1.7.0.4

From: Mike Galbraith on
On Fri, 2010-07-09 at 15:33 -0700, Darren Hart wrote:
> The requeue_pi mechanism introduced proxy locking of the rtmutex. This creates
> a scenario where a task can wake up not knowing it has been enqueued on an
> rtmutex. In order to detect this, the task would have to be able to take
> task->pi_blocked_on->lock->wait_lock and/or the hb->lock. Unfortunately,
> without already holding one of these, the pi_blocked_on variable can change
> from NULL to valid or from valid to NULL. Therefore, the task cannot be allowed
> to take a sleeping lock after wakeup, or it could end up trying to block on two
> locks, with the second overwriting a valid pi_blocked_on value. This obviously
> breaks the PI mechanism.

Copy/paste of an offline query/reply, at Darren's request:

On Sat, 2010-07-10 at 10:26 -0700, Darren Hart wrote:
On 07/09/2010 09:32 PM, Mike Galbraith wrote:
> > On Fri, 2010-07-09 at 13:05 -0700, Darren Hart wrote:
> >
> >> The core of the problem is that the proxy_lock blocks a task on a lock
> >> the task knows nothing about. So when it wakes up inside of
> >> futex_wait_requeue_pi, it immediately tries to block on hb->lock to
> >> check why it woke up. This has the potential to block the task on two
> >> locks (thus overwriting the pi_blocked_on). Any attempt preventing this
> >> involves a lock, and ultimately the hb->lock. The only solution I see
> >> is to make the hb->locks raw locks (thanks to Steven Rostedt for
> >> original idea and batting this around with me in IRC).
> >
> > Hm, so wakee _was_ munging his own state after all.
> >
> > Out of curiosity, what's wrong with holding his pi_lock across the
> > wakeup? He can _try_ to block, but can't until pi state is stable.
> >
> > I presume there's a big fat gotcha that's just not obvious to futex
> > locking newbie :)
>
> It'll take me more time than I have right now to be positive, but:
>
>
> rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
>
> raw_spin_unlock(&current->pi_lock);
>
> Your patch moved the unlock before the set_owner. I _believe_ this can
> break the pi boosting logic - current is the owner until it calls
> set_owner to be pendowner. I haven't traced this entire path yet, but
> that's my gut feel.

I _think_ it should be fine to do that. Setting an owner seems to only
require holding the wait_lock. I could easily be missing subtleties
though. Looking around, I didn't see any reason not to unlock the
owner's pi_lock after twiddling pi_waiters (and still don't, but...).

> However, your idea has merit, as we have to take our ->pi_lock before
> we can block on the hb->lock (inside task_blocks_on_rt_mutex()).
>
> If we can't move the unlock above before set_owner, then we may need a:
>
> retry:
> cur->lock()
> top_waiter = get_top_waiter()
> cur->unlock()
>
> double_lock(cur, topwaiter)
> if top_waiter != get_top_waiter()
> double_unlock(cur, topwaiter)
> goto retry
>
> Not ideal, but I think I prefer that to making all the hb locks raw.
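
In C, the retry scheme above might look roughly like this (a sketch;
double_lock()/double_unlock() of two tasks' pi_locks are assumed
helpers, not existing APIs):

static void lock_cur_and_top_waiter(struct task_struct *cur,
				    struct rt_mutex *lock)
{
	struct task_struct *top;

retry:
	raw_spin_lock(&cur->pi_lock);
	top = rt_mutex_top_waiter(lock)->task;	/* sample the top waiter */
	raw_spin_unlock(&cur->pi_lock);

	double_lock(cur, top);		/* take both pi_locks in a fixed order */
	if (top != rt_mutex_top_waiter(lock)->task) {
		/* Top waiter changed while we held neither lock: retry. */
		double_unlock(cur, top);
		goto retry;
	}
	/* Both pi_locks held and the top waiter is stable. */
}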
>
> You dropped the CC list for some reason; probably a good idea to send
> this back out in response to my raw lock patch (4/4) - your question and
> my reply. This is crazy stuff, no harm in putting the question out there.
>
> I'll take a closer look at this when I can, if not tonight, Monday morning.

-Mike

From: Mike Galbraith on
On Sat, 2010-07-10 at 21:41 +0200, Mike Galbraith wrote:
> On Fri, 2010-07-09 at 15:33 -0700, Darren Hart wrote:

> > If we can't move the unlock above before set_owner, then we may need a:
> >
> > retry:
> > cur->lock()
> > top_waiter = get_top_waiter()
> > cur->unlock()
> >
> > double_lock(cur, topwaiter)
> > if top_waiter != get_top_waiter()
> > double_unlock(cur, topwaiter)
> > goto retry
> >
> > Not ideal, but I think I prefer that to making all the hb locks raw.

Another option: only scratch the itchy spot.

futex: non-blocking synchronization point for futex_wait_requeue_pi() and futex_requeue().

Problem analysis by Darren Hart:
The requeue_pi mechanism introduced proxy locking of the rtmutex. This creates
a scenario where a task can wake up not knowing it has been enqueued on an
rtmutex. In order to detect this, the task would have to be able to take
task->pi_blocked_on->lock->wait_lock and/or the hb->lock. Unfortunately,
without already holding one of these, the pi_blocked_on variable can change
from NULL to valid or from valid to NULL. Therefore, the task cannot be allowed
to take a sleeping lock after wakeup, or it could end up trying to block on two
locks, with the second overwriting a valid pi_blocked_on value. This obviously
breaks the PI mechanism.

Rather than convert the hb->lock to a raw spinlock, avoid blocking only in the
one spot where it cannot be allowed, i.e. before we know that lock handoff has
completed.

Signed-off-by: Mike Galbraith <efault(a)gmx.de>
Cc: Darren Hart <dvhltc(a)us.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)elte.hu>
Cc: Eric Dumazet <eric.dumazet(a)gmail.com>
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Steven Rostedt <rostedt(a)goodmis.org>

diff --git a/kernel/futex.c b/kernel/futex.c
index a6cec32..ef489f3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2255,7 +2255,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);

- spin_lock(&hb->lock);
+ /*
+ * Non-blocking synchronization point with futex_requeue().
+ *
+ * We dare not block here because this will alter PI state, possibly
+ * before our waker finishes modifying same in wakeup_next_waiter().
+ */
+ while (!spin_trylock(&hb->lock))
+ cpu_relax();
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
spin_unlock(&hb->lock);
if (ret)


From: Darren Hart on
On 07/11/2010 06:33 AM, Mike Galbraith wrote:
> On Sat, 2010-07-10 at 21:41 +0200, Mike Galbraith wrote:
>> On Fri, 2010-07-09 at 15:33 -0700, Darren Hart wrote:
>
>>> If we can't move the unlock above before set_owner, then we may need a:
>>>
>>> retry:
>>> cur->lock()
>>> top_waiter = get_top_waiter()
>>> cur->unlock()
>>>
>>> double_lock(cur, topwaiter)
>>> if top_waiter != get_top_waiter()
>>> double_unlock(cur, topwaiter)
>>> goto retry
>>>
>>> Not ideal, but I think I prefer that to making all the hb locks raw.
>
> Another option: only scratch the itchy spot.
>
> futex: non-blocking synchronization point for futex_wait_requeue_pi() and futex_requeue().
>
> Problem analysis by Darren Hart:
> The requeue_pi mechanism introduced proxy locking of the rtmutex. This creates
> a scenario where a task can wake up not knowing it has been enqueued on an
> rtmutex. In order to detect this, the task would have to be able to take
> task->pi_blocked_on->lock->wait_lock and/or the hb->lock. Unfortunately,
> without already holding one of these, the pi_blocked_on variable can change
> from NULL to valid or from valid to NULL. Therefore, the task cannot be allowed
> to take a sleeping lock after wakeup, or it could end up trying to block on two
> locks, with the second overwriting a valid pi_blocked_on value. This obviously
> breaks the PI mechanism.
>
> Rather than convert the hb->lock to a raw spinlock, avoid blocking only in the
> one spot where it cannot be allowed, i.e. before we know that lock handoff has
> completed.

I like it. I especially like that the change is only evident if you are
using the code path that introduced the problem in the first place. If
you're doing a lot of requeue_pi operations, the waking waiters have an
advantage over new pending waiters or other tasks with futexes keyed to
the same hash bucket... but that seems acceptable to me.

I'd first like to confirm that holding the pendowner->pi_lock across the
wakeup in wakeup_next_waiter() isn't feasible. If that can work, I
think the impact would be lower. I'll have a look tomorrow.

Nice work Mike.

--
Darren

> Signed-off-by: Mike Galbraith <efault(a)gmx.de>
> Cc: Darren Hart <dvhltc(a)us.ibm.com>
> Cc: Thomas Gleixner <tglx(a)linutronix.de>
> Cc: Peter Zijlstra <peterz(a)infradead.org>
> Cc: Ingo Molnar <mingo(a)elte.hu>
> Cc: Eric Dumazet <eric.dumazet(a)gmail.com>
> Cc: John Kacur <jkacur(a)redhat.com>
> Cc: Steven Rostedt <rostedt(a)goodmis.org>
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index a6cec32..ef489f3 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2255,7 +2255,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
> /* Queue the futex_q, drop the hb lock, wait for wakeup. */
> futex_wait_queue_me(hb, &q, to);
>
> - spin_lock(&hb->lock);
> + /*
> + * Non-blocking synchronization point with futex_requeue().
> + *
> + * We dare not block here because this will alter PI state, possibly
> + * before our waker finishes modifying same in wakeup_next_waiter().
> + */
> + while (!spin_trylock(&hb->lock))
> + cpu_relax();
> ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
> spin_unlock(&hb->lock);
> if (ret)
>
>


--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
From: Steven Rostedt on
On Sun, 2010-07-11 at 15:33 +0200, Mike Galbraith wrote:
> On Sat, 2010-07-10 at 21:41 +0200, Mike Galbraith wrote:

> diff --git a/kernel/futex.c b/kernel/futex.c
> index a6cec32..ef489f3 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2255,7 +2255,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
> /* Queue the futex_q, drop the hb lock, wait for wakeup. */
> futex_wait_queue_me(hb, &q, to);
>
> - spin_lock(&hb->lock);
> + /*
> + * Non-blocking synchronization point with futex_requeue().
> + *
> + * We dare not block here because this will alter PI state, possibly
> + * before our waker finishes modifying same in wakeup_next_waiter().
> + */
> + while (!spin_trylock(&hb->lock))
> + cpu_relax();

I agree that this would work. But I wonder if this should have an:

#ifdef PREEMPT_RT
[...]
#else
spin_lock(&hb->lock);
#endif

around it. Or encapsulate this lock in a macro that does the same thing
(just to keep the actual code cleaner)
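
For example (a sketch; the macro name is made up, and the -rt config
symbol is assumed to be CONFIG_PREEMPT_RT):

#ifdef CONFIG_PREEMPT_RT
/*
 * On -rt, hb->lock is a sleeping lock; spin on trylock so we never
 * touch pi_blocked_on here (see the changelog above).
 */
# define hb_lock_nonblocking(hb)			\
	do {						\
		while (!spin_trylock(&(hb)->lock))	\
			cpu_relax();			\
	} while (0)
#else
# define hb_lock_nonblocking(hb)	spin_lock(&(hb)->lock)
#endif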

-- Steve

> ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
> spin_unlock(&hb->lock);
> if (ret)
>
>

