[PATCH] pi-futex: Fix exit races and locking problems

author Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>

Fri, 8 Jun 2007 10:29:30 +0000 (10:29 +0000)

committer Willy Tarreau <w@1wt.eu>

Wed, 15 Aug 2007 08:02:26 +0000 (10:02 +0200)
author Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Fri, 8 Jun 2007 10:29:30 +0000 (10:29 +0000)
committer Willy Tarreau <w@1wt.eu>
Wed, 15 Aug 2007 08:02:26 +0000 (10:02 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 4463735351904f039b1c3239f6161f52ce88ad77..7a0cc6708e528f79df2edc90ca87e41ba7d6a268 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,7 @@ static inline void put_task_struct(struct task_struct *t)
                                         /* Not implemented yet, only for 486*/
  #define PF_STARTING    0x00000002      /* being created */
  #define PF_EXITING     0x00000004      /* getting shut down */
+#define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
  #define PF_FORKNOEXEC  0x00000040      /* forked but didn't exec */
  #define PF_SUPERPRIV   0x00000100      /* used super-user privileges */
  #define PF_DUMPCORE    0x00000200      /* dumped core */
diff --git a/kernel/exit.c b/kernel/exit.c

index fec12eb124716f7e05e2564f241beef2049834cb..d306845681938eb7d5ad4951ffcf2ef915881c39 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -883,13 +883,29 @@ fastcall NORET_TYPE void do_exit(long code)
         if (unlikely(tsk->flags & PF_EXITING)) {
                 printk(KERN_ALERT
                         "Fixing recursive fault but reboot is needed!\n");
+               /*
+                * We can do this unlocked here. The futex code uses
+                * this flag just to verify whether the pi state
+                * cleanup has been done or not. In the worst case it
+                * loops once more. We pretend that the cleanup was
+                * done as there is no way to return. Either the
+                * OWNER_DIED bit is set by now or we push the blocked
+                * task into the wait for ever nirwana as well.
+                */
+               tsk->flags |= PF_EXITPIDONE;
                 if (tsk->io_context)
                         exit_io_context();
                 set_current_state(TASK_UNINTERRUPTIBLE);
                 schedule();
         }
  
+       /*
+        * tsk->flags are checked in the futex code to protect against
+        * an exiting task cleaning up the robust pi futexes.
+        */
+       spin_lock_irq(&tsk->pi_lock);
         tsk->flags |= PF_EXITING;
+       spin_unlock_irq(&tsk->pi_lock);
  
         if (unlikely(in_atomic()))
                 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -956,6 +972,12 @@ fastcall NORET_TYPE void do_exit(long code)
          * Make sure we are holding no locks:
          */
         debug_check_no_locks_held(tsk);
+       /*
+        * We can do this unlocked here. The futex code uses this flag
+        * just to verify whether the pi state cleanup has been done
+        * or not. In the worst case it loops once more.
+        */
+       tsk->flags |= PF_EXITPIDONE;
  
         if (tsk->io_context)
                 exit_io_context();
diff --git a/kernel/futex.c b/kernel/futex.c

index 1df411e6be3331cfd5eb8180552c1af60b27de5b..c93ffbffdde4f1b868b848558511fe6aa906ac16 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -396,10 +396,6 @@ static struct task_struct * futex_find_get_task(pid_t pid)
                 p = NULL;
                 goto out_unlock;
         }
-       if (p->exit_state != 0) {
-               p = NULL;
-               goto out_unlock;
-       }
         get_task_struct(p);
  out_unlock:
         rcu_read_unlock();
@@ -467,7 +463,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
         struct futex_q *this, *next;
         struct list_head *head;
         struct task_struct *p;
-       pid_t pid;
+       pid_t pid = uval & FUTEX_TID_MASK;
  
         head = &hb->chain;
  
@@ -485,6 +481,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
                                 return -EINVAL;
  
                         WARN_ON(!atomic_read(&pi_state->refcount));
+                       WARN_ON(pid && pi_state->owner &&
+                               pi_state->owner->pid != pid);
  
                         atomic_inc(&pi_state->refcount);
                         me->pi_state = pi_state;
@@ -495,15 +493,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
  
         /*
          * We are the first waiter - try to look up the real owner and attach
-        * the new pi_state to it, but bail out when the owner died bit is set
-        * and TID = 0:
+        * the new pi_state to it, but bail out when TID = 0
          */
-       pid = uval & FUTEX_TID_MASK;
-       if (!pid && (uval & FUTEX_OWNER_DIED))
+       if (!pid)
                 return -ESRCH;
         p = futex_find_get_task(pid);
-       if (!p)
-               return -ESRCH;
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       /*
+        * We need to look at the task state flags to figure out,
+        * whether the task is exiting. To protect against the do_exit
+        * change of the task flags, we do this protected by
+        * p->pi_lock:
+        */
+       spin_lock_irq(&p->pi_lock);
+       if (unlikely(p->flags & PF_EXITING)) {
+               /*
+                * The task is on the way out. When PF_EXITPIDONE is
+                * set, we know that the task has finished the
+                * cleanup:
+                */
+               int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+               spin_unlock_irq(&p->pi_lock);
+               put_task_struct(p);
+               return ret;
+       }
  
         pi_state = alloc_pi_state();
  
@@ -516,7 +532,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
         /* Store the key for possible exit cleanups: */
         pi_state->key = me->key;
  
-       spin_lock_irq(&p->pi_lock);
         WARN_ON(!list_empty(&pi_state->list));
         list_add(&pi_state->list, &p->pi_state_list);
         pi_state->owner = p;
@@ -583,15 +598,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
          * preserve the owner died bit.)
          */
         if (!(uval & FUTEX_OWNER_DIED)) {
+               int ret = 0;
+
                 newval = FUTEX_WAITERS | new_owner->pid;
  
                 pagefault_disable();
                 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
                 pagefault_enable();
+
                 if (curval == -EFAULT)
-                       return -EFAULT;
+                       ret = -EFAULT;
                 if (curval != uval)
-                       return -EINVAL;
+                       ret = -EINVAL;
+               if (ret) {
+                       spin_unlock(&pi_state->pi_mutex.wait_lock);
+                       return ret;
+               }
         }
  
         spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1149,6 +1171,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
         if (unlikely(ret != 0))
                 goto out_release_sem;
  
+ retry_unlocked:
         hb = queue_lock(&q, -1, NULL);
  
   retry_locked:
@@ -1200,34 +1223,58 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
         ret = lookup_pi_state(uval, hb, &q);
  
         if (unlikely(ret)) {
-               /*
-                * There were no waiters and the owner task lookup
-                * failed. When the OWNER_DIED bit is set, then we
-                * know that this is a robust futex and we actually
-                * take the lock. This is safe as we are protected by
-                * the hash bucket lock. We also set the waiters bit
-                * unconditionally here, to simplify glibc handling of
-                * multiple tasks racing to acquire the lock and
-                * cleanup the problems which were left by the dead
-                * owner.
-                */
-               if (curval & FUTEX_OWNER_DIED) {
-                       uval = newval;
-                       newval = current->pid |
-                               FUTEX_OWNER_DIED | FUTEX_WAITERS;
+               switch (ret) {
  
-                       pagefault_disable();
-                       curval = futex_atomic_cmpxchg_inatomic(uaddr,
-                                                              uval, newval);
-                       pagefault_enable();
+               case -EAGAIN:
+                       /*
+                        * Task is exiting and we just wait for the
+                        * exit to complete.
+                        */
+                       queue_unlock(&q, hb);
+                       up_read(&curr->mm->mmap_sem);
+                       cond_resched();
+                       goto retry;
  
-                       if (unlikely(curval == -EFAULT))
+               case -ESRCH:
+                       /*
+                        * No owner found for this futex. Check if the
+                        * OWNER_DIED bit is set to figure out whether
+                        * this is a robust futex or not.
+                        */
+                       if (get_futex_value_locked(&curval, uaddr))
                                 goto uaddr_faulted;
-                       if (unlikely(curval != uval))
-                               goto retry_locked;
-                       ret = 0;
+
+                       /*
+                        * There were no waiters and the owner task lookup
+                        * failed. When the OWNER_DIED bit is set, then we
+                        * know that this is a robust futex and we actually
+                        * take the lock. This is safe as we are protected by
+                        * the hash bucket lock. We also set the waiters bit
+                        * unconditionally here, to simplify glibc handling of
+                        * multiple tasks racing to acquire the lock and
+                        * cleanup the problems which were left by the dead
+                        * owner.
+                        */
+                       if (curval & FUTEX_OWNER_DIED) {
+                               uval = newval;
+                               newval = current->pid |
+                                       FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+                               pagefault_disable();
+                               curval = futex_atomic_cmpxchg_inatomic(uaddr,
+                                                                      uval,
+                                                                      newval);
+                               pagefault_enable();
+
+                               if (unlikely(curval == -EFAULT))
+                                       goto uaddr_faulted;
+                               if (unlikely(curval != uval))
+                                       goto retry_locked;
+                               ret = 0;
+                       }
+               default:
+                       goto out_unlock_release_sem;
                 }
-               goto out_unlock_release_sem;
         }
  
         /*
@@ -1279,39 +1326,52 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
                 list_add(&q.pi_state->list, &current->pi_state_list);
                 spin_unlock_irq(&current->pi_lock);
  
-               /* Unqueue and drop the lock */
-               unqueue_me_pi(&q, hb);
-               up_read(&curr->mm->mmap_sem);
                 /*
                  * We own it, so we have to replace the pending owner
-                * TID. This must be atomic as we have preserve the
+                * TID. This must be atomic as we have to preserve the
                  * owner died bit here.
                  */
-               ret = get_user(uval, uaddr);
+               ret = get_futex_value_locked(&uval, uaddr);
                 while (!ret) {
                         newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+                       pagefault_disable();
                         curval = futex_atomic_cmpxchg_inatomic(uaddr,
                                                                uval, newval);
+                       pagefault_enable();
+
                         if (curval == -EFAULT)
                                 ret = -EFAULT;
                         if (curval == uval)
                                 break;
                         uval = curval;
                 }
-       } else {
+       } else if (ret) {
                 /*
                  * Catch the rare case, where the lock was released
                  * when we were on the way back before we locked
                  * the hash bucket.
                  */
-               if (ret && q.pi_state->owner == curr) {
-                       if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-                               ret = 0;
+               if (q.pi_state->owner == curr &&
+                   rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+                       ret = 0;
+               } else {
+                       /*
+                        * Paranoia check. If we did not take the lock
+                        * in the trylock above, then we should not be
+                        * the owner of the rtmutex, neither the real
+                        * nor the pending one:
+                        */
+                       if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+                               printk(KERN_ERR "futex_lock_pi: ret = %d "
+                                      "pi-mutex: %p pi-state %p\n", ret,
+                                      q.pi_state->pi_mutex.owner,
+                                      q.pi_state->owner);
                 }
-               /* Unqueue and drop the lock */
-               unqueue_me_pi(&q, hb);
-               up_read(&curr->mm->mmap_sem);
         }
+       /* Unqueue and drop the lock */
+       unqueue_me_pi(&q, hb);
+       up_read(&curr->mm->mmap_sem);
  
         if (!detect && ret == -EDEADLK && 0)
                 force_sig(SIGKILL, current);
@@ -1331,16 +1391,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
          * non-atomically.  Therefore, if get_user below is not
          * enough, we need to handle the fault ourselves, while
          * still holding the mmap_sem.
+        *
+        * ... and hb->lock. :-) --ANK
          */
+       queue_unlock(&q, hb);
+
         if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-                       ret = -EFAULT;
-                       goto out_unlock_release_sem;
-               }
-               goto retry_locked;
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
+               if (ret)
+                       goto out_release_sem;
+               goto retry_unlocked;
         }
  
-       queue_unlock(&q, hb);
         up_read(&curr->mm->mmap_sem);
  
         ret = get_user(uval, uaddr);
@@ -1382,9 +1444,9 @@ retry:
                 goto out;
  
         hb = hash_futex(&key);
+retry_unlocked:
         spin_lock(&hb->lock);
  
-retry_locked:
         /*
          * To avoid races, try to do the TID -> 0 atomic transition
          * again. If it succeeds then we can return without waking
@@ -1446,16 +1508,17 @@ pi_faulted:
          * non-atomically.  Therefore, if get_user below is not
          * enough, we need to handle the fault ourselves, while
          * still holding the mmap_sem.
+        *
+        * ... and hb->lock. :-) --ANK
          */
+       spin_unlock(&hb->lock);
+
         if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-                       ret = -EFAULT;
-                       goto out_unlock;
-               }
-               goto retry_locked;
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
+               if (ret)
+                       goto out;
+               goto retry_unlocked;
         }
-
-       spin_unlock(&hb->lock);
         up_read(&current->mm->mmap_sem);
  
         ret = get_user(uval, uaddr);
author	Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
	Fri, 8 Jun 2007 10:29:30 +0000 (10:29 +0000)
committer	Willy Tarreau <w@1wt.eu>
	Wed, 15 Aug 2007 08:02:26 +0000 (10:02 +0200)
include/linux/sched.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/futex.c		patch \| blob \| history