kernel/futex.c (blob 44a1261cb9ff63a33eec42136be4794a0752a0ac)
1 /*
2  *  Fast Userspace Mutexes (which I call "Futexes!").
3  *  (C) Rusty Russell, IBM 2002
4  *
5  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7  *
8  *  Removed page pinning, fix privately mapped COW pages and other cleanups
9  *  (C) Copyright 2003, 2004 Jamie Lokier
10  *
11  *  Robust futex support started by Ingo Molnar
12  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14  *
15  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
16  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18  *
19  *  PRIVATE futexes by Eric Dumazet
20  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21  *
22  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23  *  Copyright (C) IBM Corporation, 2009
24  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
25  *
26  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27  *  enough at me, Linus for the original (flawed) idea, Matthew
28  *  Kirkwood for proof-of-concept implementation.
29  *
30  *  "The futexes are also cursed."
31  *  "But they come in a choice of three flavours!"
32  *
33  *  This program is free software; you can redistribute it and/or modify
34  *  it under the terms of the GNU General Public License as published by
35  *  the Free Software Foundation; either version 2 of the License, or
36  *  (at your option) any later version.
37  *
38  *  This program is distributed in the hope that it will be useful,
39  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
40  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
41  *  GNU General Public License for more details.
42  *
43  *  You should have received a copy of the GNU General Public License
44  *  along with this program; if not, write to the Free Software
45  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46  */
47 #include <linux/slab.h>
48 #include <linux/poll.h>
49 #include <linux/fs.h>
50 #include <linux/file.h>
51 #include <linux/jhash.h>
52 #include <linux/init.h>
53 #include <linux/futex.h>
54 #include <linux/mount.h>
55 #include <linux/pagemap.h>
56 #include <linux/syscalls.h>
57 #include <linux/signal.h>
58 #include <linux/export.h>
59 #include <linux/magic.h>
60 #include <linux/pid.h>
61 #include <linux/nsproxy.h>
62 #include <linux/ptrace.h>
63 #include <linux/sched/rt.h>
64 #include <linux/hugetlb.h>
65 #include <linux/freezer.h>
66 #include <linux/bootmem.h>
67
68 #include <asm/futex.h>
69
70 #include "locking/rtmutex_common.h"
71
72 /*
73  * Basic futex operation and ordering guarantees:
74  *
75  * The waiter reads the futex value in user space and calls
76  * futex_wait(). This function computes the hash bucket and acquires
77  * the hash bucket lock. After that it reads the futex user space value
78  * again and verifies that the data has not changed. If it has not changed
79  * it enqueues itself into the hash bucket, releases the hash bucket lock
80  * and schedules.
81  *
82  * The waker side modifies the user space value of the futex and calls
83  * futex_wake(). This function computes the hash bucket and acquires the
84  * hash bucket lock. Then it looks for waiters on that futex in the hash
85  * bucket and wakes them.
86  *
87  * In futex wake-up scenarios where no tasks are blocked on a futex, we can
88  * avoid taking the hb spinlock and simply return. In order for this
89  * optimization to work, ordering guarantees must exist so that the waiter
90  * being added to the list is acknowledged when the list is concurrently being
91  * checked by the waker, avoiding scenarios like the following:
92  *
93  * CPU 0                               CPU 1
94  * val = *futex;
95  * sys_futex(WAIT, futex, val);
96  *   futex_wait(futex, val);
97  *   uval = *futex;
98  *                                     *futex = newval;
99  *                                     sys_futex(WAKE, futex);
100  *                                       futex_wake(futex);
101  *                                       if (queue_empty())
102  *                                         return;
103  *   if (uval == val)
104  *     lock(hash_bucket(futex));
105  *     queue();
106  *     unlock(hash_bucket(futex));
107  *     schedule();
108  *
109  * This would cause the waiter on CPU 0 to wait forever because it
110  * missed the transition of the user space value from val to newval
111  * and the waker did not find the waiter in the hash bucket queue.
112  *
113  * The correct serialization ensures that a waiter either observes
114  * the changed user space value before blocking or is woken by a
115  * concurrent waker:
116  *
117  * CPU 0                                 CPU 1
118  * val = *futex;
119  * sys_futex(WAIT, futex, val);
120  *   futex_wait(futex, val);
121  *
122  *   waiters++;
123  *   mb(); (A) <-- paired with -.
124  *                              |
125  *   lock(hash_bucket(futex));  |
126  *                              |
127  *   uval = *futex;             |
128  *                              |        *futex = newval;
129  *                              |        sys_futex(WAKE, futex);
130  *                              |          futex_wake(futex);
131  *                              |
132  *                              `------->  mb(); (B)
133  *   if (uval == val)
134  *     queue();
135  *     unlock(hash_bucket(futex));
136  *     schedule();                         if (waiters)
137  *                                           lock(hash_bucket(futex));
138  *                                           wake_waiters(futex);
139  *                                           unlock(hash_bucket(futex));
140  *
141  * Where (A) orders the waiters increment and the futex value read -- this
142  * is guaranteed by the head counter in the hb spinlock; and where (B)
143  * orders the write to futex and the waiters read -- this is done by the
144  * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
145  * depending on the futex type.
146  *
147  * This yields the following case (where X:=waiters, Y:=futex):
148  *
149  *      X = Y = 0
150  *
151  *      w[X]=1          w[Y]=1
152  *      MB              MB
153  *      r[Y]=y          r[X]=x
154  *
155  * Which guarantees that x==0 && y==0 is impossible; which translates back into
156  * the guarantee that we cannot both miss the futex variable change and the
157  * enqueue.
158  */
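/*
 * A minimal, illustrative userspace sketch of the wait/wake protocol described
 * above (not part of the implementation): the waiter passes the value it
 * observed so FUTEX_WAIT can re-check it under the hash bucket lock, and the
 * waker publishes the new value before calling FUTEX_WAKE.  The futex() helper
 * is an assumed thin wrapper around syscall(SYS_futex, ...) (glibc provides no
 * wrapper), and futexvar/WAIT_VAL/NEW_VAL are placeholder names.
 *
 *      // Waiter: only block if the variable still holds the value we saw.
 *      int val = *futexvar;
 *      while (val == WAIT_VAL) {
 *              futex(futexvar, FUTEX_WAIT, val, NULL, NULL, 0);
 *              val = *futexvar;
 *      }
 *
 *      // Waker: store the new value first, then wake one waiter.
 *      *futexvar = NEW_VAL;
 *      futex(futexvar, FUTEX_WAKE, 1, NULL, NULL, 0);
 */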
159
160 int __read_mostly futex_cmpxchg_enabled;
161
162 /*
163  * Futex flags used to encode options to functions and preserve them across
164  * restarts.
165  */
166 #define FLAGS_SHARED            0x01
167 #define FLAGS_CLOCKRT           0x02
168 #define FLAGS_HAS_TIMEOUT       0x04
169
170 /*
171  * Priority Inheritance state:
172  */
173 struct futex_pi_state {
174         /*
175          * list of 'owned' pi_state instances - these have to be
176          * cleaned up in do_exit() if the task exits prematurely:
177          */
178         struct list_head list;
179
180         /*
181          * The PI object:
182          */
183         struct rt_mutex pi_mutex;
184
185         struct task_struct *owner;
186         atomic_t refcount;
187
188         union futex_key key;
189 };
190
191 /**
192  * struct futex_q - The hashed futex queue entry, one per waiting task
193  * @list:               priority-sorted list of tasks waiting on this futex
194  * @task:               the task waiting on the futex
195  * @lock_ptr:           the hash bucket lock
196  * @key:                the key the futex is hashed on
197  * @pi_state:           optional priority inheritance state
198  * @rt_waiter:          rt_waiter storage for use with requeue_pi
199  * @requeue_pi_key:     the requeue_pi target futex key
200  * @bitset:             bitset for the optional bitmasked wakeup
201  *
202  * We use this hashed waitqueue, instead of a normal wait_queue_t, so
203  * we can wake only the relevant ones (hashed queues may be shared).
204  *
205  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
206  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
207  * The order of wakeup is always to make the first condition true, then
208  * the second.
209  *
210  * PI futexes are typically woken before they are removed from the hash list via
211  * the rt_mutex code. See unqueue_me_pi().
212  */
213 struct futex_q {
214         struct plist_node list;
215
216         struct task_struct *task;
217         spinlock_t *lock_ptr;
218         union futex_key key;
219         struct futex_pi_state *pi_state;
220         struct rt_mutex_waiter *rt_waiter;
221         union futex_key *requeue_pi_key;
222         u32 bitset;
223 };
224
225 static const struct futex_q futex_q_init = {
226         /* list gets initialized in queue_me() */
227         .key = FUTEX_KEY_INIT,
228         .bitset = FUTEX_BITSET_MATCH_ANY
229 };
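/*
 * A minimal sketch of the waiter-side "woken" test described in the comment
 * above (simplified; the real check lives in the unqueue paths):
 *
 *      if (plist_node_empty(&q->list) || !q->lock_ptr)
 *              return;         /* already woken, nothing left to dequeue */
 */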
230
231 /*
232  * Hash buckets are shared by all the futex_keys that hash to the same
233  * location.  Each key may have multiple futex_q structures, one for each task
234  * waiting on a futex.
235  */
236 struct futex_hash_bucket {
237         spinlock_t lock;
238         struct plist_head chain;
239 } ____cacheline_aligned_in_smp;
240
241 static unsigned long __read_mostly futex_hashsize;
242
243 static struct futex_hash_bucket *futex_queues;
244
245 static inline void futex_get_mm(union futex_key *key)
246 {
247         atomic_inc(&key->private.mm->mm_count);
248         /*
249          * Ensure futex_get_mm() implies a full barrier such that
250          * get_futex_key() implies a full barrier. This is relied upon
251          * as full barrier (B), see the ordering comment above.
252          */
253         smp_mb__after_atomic_inc();
254 }
255
256 static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
257 {
258 #ifdef CONFIG_SMP
259         /*
260          * Tasks trying to enter the critical region are most likely
261          * potential waiters that will be added to the plist. Ensure
262          * that wakers won't miss tasks that are about to sleep in the
263          * window between the wait call and the actual plist_add.
264          */
265         if (spin_is_locked(&hb->lock))
266                 return true;
267         smp_rmb(); /* Make sure we check the lock state first */
268
269         return !plist_head_empty(&hb->chain);
270 #else
271         return true;
272 #endif
273 }
274
275 /*
276  * We hash on the keys returned from get_futex_key (see below).
277  */
278 static struct futex_hash_bucket *hash_futex(union futex_key *key)
279 {
280         u32 hash = jhash2((u32*)&key->both.word,
281                           (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
282                           key->both.offset);
283         return &futex_queues[hash & (futex_hashsize - 1)];
284 }
285
286 /*
287  * Return 1 if two futex_keys are equal, 0 otherwise.
288  */
289 static inline int match_futex(union futex_key *key1, union futex_key *key2)
290 {
291         return (key1 && key2
292                 && key1->both.word == key2->both.word
293                 && key1->both.ptr == key2->both.ptr
294                 && key1->both.offset == key2->both.offset);
295 }
296
297 /*
298  * Take a reference to the resource addressed by a key.
299  * Can be called while holding spinlocks.
300  *
301  */
302 static void get_futex_key_refs(union futex_key *key)
303 {
304         if (!key->both.ptr)
305                 return;
306
307         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
308         case FUT_OFF_INODE:
309                 ihold(key->shared.inode); /* implies MB (B) */
310                 break;
311         case FUT_OFF_MMSHARED:
312                 futex_get_mm(key); /* implies MB (B) */
313                 break;
314         }
315 }
316
317 /*
318  * Drop a reference to the resource addressed by a key.
319  * The hash bucket spinlock must not be held.
320  */
321 static void drop_futex_key_refs(union futex_key *key)
322 {
323         if (!key->both.ptr) {
324                 /* If we're here then we tried to put a key we failed to get */
325                 WARN_ON_ONCE(1);
326                 return;
327         }
328
329         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
330         case FUT_OFF_INODE:
331                 iput(key->shared.inode);
332                 break;
333         case FUT_OFF_MMSHARED:
334                 mmdrop(key->private.mm);
335                 break;
336         }
337 }
338
339 /**
340  * get_futex_key() - Get parameters which are the keys for a futex
341  * @uaddr:      virtual address of the futex
342  * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
343  * @key:        address where result is stored.
344  * @rw:         mapping needs to be read/write (values: VERIFY_READ,
345  *              VERIFY_WRITE)
346  *
347  * Return: a negative error code or 0
348  *
349  * The key words are stored in *key on success.
350  *
351  * For shared mappings, it's (page->index, file_inode(vma->vm_file),
352  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
353  * We can usually work out the index without swapping in the page.
354  *
355  * lock_page() might sleep, the caller should not hold a spinlock.
356  */
357 static int
358 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
359 {
360         unsigned long address = (unsigned long)uaddr;
361         struct mm_struct *mm = current->mm;
362         struct page *page, *page_head;
363         int err, ro = 0;
364
365         /*
366          * The futex address must be "naturally" aligned.
367          */
368         key->both.offset = address % PAGE_SIZE;
369         if (unlikely((address % sizeof(u32)) != 0))
370                 return -EINVAL;
371         address -= key->both.offset;
372
373         if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
374                 return -EFAULT;
375
376         /*
377          * PROCESS_PRIVATE futexes are fast.
378          * As the mm cannot disappear under us and the 'key' only needs
379          * the virtual address, we don't even have to find the underlying vma.
380          * Note: We do have to check that 'uaddr' is a valid user address,
381          *       but access_ok() should be faster than find_vma().
382          */
383         if (!fshared) {
384                 key->private.mm = mm;
385                 key->private.address = address;
386                 get_futex_key_refs(key);  /* implies MB (B) */
387                 return 0;
388         }
389
390 again:
391         err = get_user_pages_fast(address, 1, 1, &page);
392         /*
393          * If write access is not required (e.g. FUTEX_WAIT), try
394          * to get read-only access.
395          */
396         if (err == -EFAULT && rw == VERIFY_READ) {
397                 err = get_user_pages_fast(address, 1, 0, &page);
398                 ro = 1;
399         }
400         if (err < 0)
401                 return err;
402         else
403                 err = 0;
404
405 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
406         page_head = page;
407         if (unlikely(PageTail(page))) {
408                 put_page(page);
409                 /* serialize against __split_huge_page_splitting() */
410                 local_irq_disable();
411                 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
412                         page_head = compound_head(page);
413                         /*
414                          * page_head is valid pointer but we must pin
415                          * it before taking the PG_lock and/or
416                          * PG_compound_lock. The moment we re-enable
417                          * irqs __split_huge_page_splitting() can
418                          * return and the head page can be freed from
419                          * under us. We can't take the PG_lock and/or
420                          * PG_compound_lock on a page that could be
421                          * freed from under us.
422                          */
423                         if (page != page_head) {
424                                 get_page(page_head);
425                                 put_page(page);
426                         }
427                         local_irq_enable();
428                 } else {
429                         local_irq_enable();
430                         goto again;
431                 }
432         }
433 #else
434         page_head = compound_head(page);
435         if (page != page_head) {
436                 get_page(page_head);
437                 put_page(page);
438         }
439 #endif
440
441         lock_page(page_head);
442
443         /*
444          * If page_head->mapping is NULL, then it cannot be a PageAnon
445          * page; but it might be the ZERO_PAGE or in the gate area or
446          * in a special mapping (all cases which we are happy to fail);
447          * or it may have been a good file page when get_user_pages_fast
448          * found it, but truncated or holepunched or subjected to
449          * invalidate_complete_page2 before we got the page lock (also
450          * cases which we are happy to fail).  And we hold a reference,
451          * so refcount care in invalidate_complete_page's remove_mapping
452          * prevents drop_caches from setting mapping to NULL beneath us.
453          *
454          * The case we do have to guard against is when memory pressure made
455          * shmem_writepage move it from filecache to swapcache beneath us:
456          * an unlikely race, but we do need to retry for page_head->mapping.
457          */
458         if (!page_head->mapping) {
459                 int shmem_swizzled = PageSwapCache(page_head);
460                 unlock_page(page_head);
461                 put_page(page_head);
462                 if (shmem_swizzled)
463                         goto again;
464                 return -EFAULT;
465         }
466
467         /*
468          * Private mappings are handled in a simple way.
469          *
470          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
471          * it's a read-only handle, it's expected that futexes attach to
472          * the object not the particular process.
473          */
474         if (PageAnon(page_head)) {
475                 /*
476                  * A RO anonymous page will never change and thus doesn't make
477                  * sense for futex operations.
478                  */
479                 if (ro) {
480                         err = -EFAULT;
481                         goto out;
482                 }
483
484                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
485                 key->private.mm = mm;
486                 key->private.address = address;
487         } else {
488                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
489                 key->shared.inode = page_head->mapping->host;
490                 key->shared.pgoff = basepage_index(page);
491         }
492
493         get_futex_key_refs(key); /* implies MB (B) */
494
495 out:
496         unlock_page(page_head);
497         put_page(page_head);
498         return err;
499 }
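/*
 * To make the two key layouts above concrete, a short sketch of what
 * get_futex_key() ends up storing (taken from the assignments above):
 *
 *      // PROCESS_PRIVATE futexes and shared anonymous pages: keyed on the mm
 *      key->private.mm      = current->mm;
 *      key->private.address = address;         // page-aligned user address
 *
 *      // PROCESS_SHARED, file-backed pages: keyed on the inode
 *      key->shared.inode = page_head->mapping->host;
 *      key->shared.pgoff = basepage_index(page);
 *
 * Two tasks mapping the same file page therefore compute the same key (and
 * hash to the same bucket) even though their virtual addresses differ.
 */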
500
501 static inline void put_futex_key(union futex_key *key)
502 {
503         drop_futex_key_refs(key);
504 }
505
506 /**
507  * fault_in_user_writeable() - Fault in user address and verify RW access
508  * @uaddr:      pointer to faulting user space address
509  *
510  * Slow path to fixup the fault we just took in the atomic write
511  * access to @uaddr.
512  *
513  * We have no generic implementation of a non-destructive write to the
514  * user address. We know that we faulted in the atomic pagefault
515  * disabled section so we can as well avoid the #PF overhead by
516  * calling get_user_pages() right away.
517  */
518 static int fault_in_user_writeable(u32 __user *uaddr)
519 {
520         struct mm_struct *mm = current->mm;
521         int ret;
522
523         down_read(&mm->mmap_sem);
524         ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
525                                FAULT_FLAG_WRITE);
526         up_read(&mm->mmap_sem);
527
528         return ret < 0 ? ret : 0;
529 }
530
531 /**
532  * futex_top_waiter() - Return the highest priority waiter on a futex
533  * @hb:         the hash bucket the futex_q's reside in
534  * @key:        the futex key (to distinguish it from other futex futex_q's)
535  *
536  * Must be called with the hb lock held.
537  */
538 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
539                                         union futex_key *key)
540 {
541         struct futex_q *this;
542
543         plist_for_each_entry(this, &hb->chain, list) {
544                 if (match_futex(&this->key, key))
545                         return this;
546         }
547         return NULL;
548 }
549
550 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
551                                       u32 uval, u32 newval)
552 {
553         int ret;
554
555         pagefault_disable();
556         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
557         pagefault_enable();
558
559         return ret;
560 }
561
562 static int get_futex_value_locked(u32 *dest, u32 __user *from)
563 {
564         int ret;
565
566         pagefault_disable();
567         ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
568         pagefault_enable();
569
570         return ret ? -EFAULT : 0;
571 }
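/*
 * The two helpers above are used in a retry pattern that recurs throughout
 * this file (a simplified sketch, error handling trimmed): the user access is
 * attempted with pagefaults disabled while the hash bucket lock is held, and
 * only on -EFAULT is the lock dropped so the page can be faulted in writable
 * via fault_in_user_writeable().
 *
 *      spin_lock(&hb->lock);
 *      ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
 *      if (ret == -EFAULT) {
 *              spin_unlock(&hb->lock);
 *              if (fault_in_user_writeable(uaddr))
 *                      return -EFAULT;
 *              goto retry;             /* take the lock and try again */
 *      }
 */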
572
573
574 /*
575  * PI code:
576  */
577 static int refill_pi_state_cache(void)
578 {
579         struct futex_pi_state *pi_state;
580
581         if (likely(current->pi_state_cache))
582                 return 0;
583
584         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
585
586         if (!pi_state)
587                 return -ENOMEM;
588
589         INIT_LIST_HEAD(&pi_state->list);
590         /* pi_mutex gets initialized later */
591         pi_state->owner = NULL;
592         atomic_set(&pi_state->refcount, 1);
593         pi_state->key = FUTEX_KEY_INIT;
594
595         current->pi_state_cache = pi_state;
596
597         return 0;
598 }
599
600 static struct futex_pi_state * alloc_pi_state(void)
601 {
602         struct futex_pi_state *pi_state = current->pi_state_cache;
603
604         WARN_ON(!pi_state);
605         current->pi_state_cache = NULL;
606
607         return pi_state;
608 }
609
610 static void free_pi_state(struct futex_pi_state *pi_state)
611 {
612         if (!atomic_dec_and_test(&pi_state->refcount))
613                 return;
614
615         /*
616          * If pi_state->owner is NULL, the owner is most probably dying
617          * and has cleaned up the pi_state already
618          */
619         if (pi_state->owner) {
620                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
621                 list_del_init(&pi_state->list);
622                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
623
624                 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
625         }
626
627         if (current->pi_state_cache)
628                 kfree(pi_state);
629         else {
630                 /*
631                  * pi_state->list is already empty.
632                  * clear pi_state->owner.
633                  * refcount is at 0 - put it back to 1.
634                  */
635                 pi_state->owner = NULL;
636                 atomic_set(&pi_state->refcount, 1);
637                 current->pi_state_cache = pi_state;
638         }
639 }
640
641 /*
642  * Look up the task based on what TID userspace gave us.
643  * We don't trust it.
644  */
645 static struct task_struct * futex_find_get_task(pid_t pid)
646 {
647         struct task_struct *p;
648
649         rcu_read_lock();
650         p = find_task_by_vpid(pid);
651         if (p)
652                 get_task_struct(p);
653
654         rcu_read_unlock();
655
656         return p;
657 }
658
659 /*
660  * This task is holding PI mutexes at exit time => bad.
661  * Kernel cleans up PI-state, but userspace is likely hosed.
662  * (Robust-futex cleanup is separate and might save the day for userspace.)
663  */
664 void exit_pi_state_list(struct task_struct *curr)
665 {
666         struct list_head *next, *head = &curr->pi_state_list;
667         struct futex_pi_state *pi_state;
668         struct futex_hash_bucket *hb;
669         union futex_key key = FUTEX_KEY_INIT;
670
671         if (!futex_cmpxchg_enabled)
672                 return;
673         /*
674          * We are a ZOMBIE and nobody can enqueue itself on
675          * pi_state_list anymore, but we have to be careful
676          * versus waiters unqueueing themselves:
677          */
678         raw_spin_lock_irq(&curr->pi_lock);
679         while (!list_empty(head)) {
680
681                 next = head->next;
682                 pi_state = list_entry(next, struct futex_pi_state, list);
683                 key = pi_state->key;
684                 hb = hash_futex(&key);
685                 raw_spin_unlock_irq(&curr->pi_lock);
686
687                 spin_lock(&hb->lock);
688
689                 raw_spin_lock_irq(&curr->pi_lock);
690                 /*
691                  * We dropped the pi-lock, so re-check whether this
692                  * task still owns the PI-state:
693                  */
694                 if (head->next != next) {
695                         spin_unlock(&hb->lock);
696                         continue;
697                 }
698
699                 WARN_ON(pi_state->owner != curr);
700                 WARN_ON(list_empty(&pi_state->list));
701                 list_del_init(&pi_state->list);
702                 pi_state->owner = NULL;
703                 raw_spin_unlock_irq(&curr->pi_lock);
704
705                 rt_mutex_unlock(&pi_state->pi_mutex);
706
707                 spin_unlock(&hb->lock);
708
709                 raw_spin_lock_irq(&curr->pi_lock);
710         }
711         raw_spin_unlock_irq(&curr->pi_lock);
712 }
713
714 static int
715 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
716                 union futex_key *key, struct futex_pi_state **ps)
717 {
718         struct futex_pi_state *pi_state = NULL;
719         struct futex_q *this, *next;
720         struct task_struct *p;
721         pid_t pid = uval & FUTEX_TID_MASK;
722
723         plist_for_each_entry_safe(this, next, &hb->chain, list) {
724                 if (match_futex(&this->key, key)) {
725                         /*
726                          * Another waiter already exists - bump up
727                          * the refcount and return its pi_state:
728                          */
729                         pi_state = this->pi_state;
730                         /*
731                          * Userspace might have messed up non-PI and PI futexes
732                          */
733                         if (unlikely(!pi_state))
734                                 return -EINVAL;
735
736                         WARN_ON(!atomic_read(&pi_state->refcount));
737
738                         /*
739                          * When pi_state->owner is NULL then the owner died
740                          * and another waiter is on the fly. pi_state->owner
741                          * is fixed up by the task which acquires
742                          * pi_state->rt_mutex.
743                          *
744                          * We do not check for pid == 0 which can happen when
745                          * the owner died and robust_list_exit() cleared the
746                          * TID.
747                          */
748                         if (pid && pi_state->owner) {
749                                 /*
750                                  * Bail out if user space manipulated the
751                                  * futex value.
752                                  */
753                                 if (pid != task_pid_vnr(pi_state->owner))
754                                         return -EINVAL;
755                         }
756
757                         atomic_inc(&pi_state->refcount);
758                         *ps = pi_state;
759
760                         return 0;
761                 }
762         }
763
764         /*
765          * We are the first waiter - try to look up the real owner and attach
766          * the new pi_state to it, but bail out when TID = 0
767          */
768         if (!pid)
769                 return -ESRCH;
770         p = futex_find_get_task(pid);
771         if (!p)
772                 return -ESRCH;
773
774         /*
775          * We need to look at the task state flags to figure out
776          * whether the task is exiting. To protect against the do_exit
777          * change of the task flags, we do this protected by
778          * p->pi_lock:
779          */
780         raw_spin_lock_irq(&p->pi_lock);
781         if (unlikely(p->flags & PF_EXITING)) {
782                 /*
783                  * The task is on the way out. When PF_EXITPIDONE is
784                  * set, we know that the task has finished the
785                  * cleanup:
786                  */
787                 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
788
789                 raw_spin_unlock_irq(&p->pi_lock);
790                 put_task_struct(p);
791                 return ret;
792         }
793
794         pi_state = alloc_pi_state();
795
796         /*
797          * Initialize the pi_mutex in locked state and make 'p'
798          * the owner of it:
799          */
800         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
801
802         /* Store the key for possible exit cleanups: */
803         pi_state->key = *key;
804
805         WARN_ON(!list_empty(&pi_state->list));
806         list_add(&pi_state->list, &p->pi_state_list);
807         pi_state->owner = p;
808         raw_spin_unlock_irq(&p->pi_lock);
809
810         put_task_struct(p);
811
812         *ps = pi_state;
813
814         return 0;
815 }
816
817 /**
818  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
819  * @uaddr:              the pi futex user address
820  * @hb:                 the pi futex hash bucket
821  * @key:                the futex key associated with uaddr and hb
822  * @ps:                 the pi_state pointer where we store the result of the
823  *                      lookup
824  * @task:               the task to perform the atomic lock work for.  This will
825  *                      be "current" except in the case of requeue pi.
826  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
827  *
828  * Return:
829  *  0 - ready to wait;
830  *  1 - acquired the lock;
831  * <0 - error
832  *
833  * The hb->lock and futex_key refs shall be held by the caller.
834  */
835 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
836                                 union futex_key *key,
837                                 struct futex_pi_state **ps,
838                                 struct task_struct *task, int set_waiters)
839 {
840         int lock_taken, ret, force_take = 0;
841         u32 uval, newval, curval, vpid = task_pid_vnr(task);
842
843 retry:
844         ret = lock_taken = 0;
845
846         /*
847          * To avoid races, we attempt to take the lock here again
848          * (by doing a 0 -> TID atomic cmpxchg), while holding all
849          * the locks. It will most likely not succeed.
850          */
851         newval = vpid;
852         if (set_waiters)
853                 newval |= FUTEX_WAITERS;
854
855         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
856                 return -EFAULT;
857
858         /*
859          * Detect deadlocks.
860          */
861         if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
862                 return -EDEADLK;
863
864         /*
865          * Surprise - we got the lock. Just return to userspace:
866          */
867         if (unlikely(!curval))
868                 return 1;
869
870         uval = curval;
871
872         /*
873          * Set the FUTEX_WAITERS flag, so the owner will know it has someone
874          * to wake at the next unlock.
875          */
876         newval = curval | FUTEX_WAITERS;
877
878         /*
879          * Should we force take the futex? See below.
880          */
881         if (unlikely(force_take)) {
882                 /*
883                  * Keep the OWNER_DIED and the WAITERS bit and set the
884                  * new TID value.
885                  */
886                 newval = (curval & ~FUTEX_TID_MASK) | vpid;
887                 force_take = 0;
888                 lock_taken = 1;
889         }
890
891         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
892                 return -EFAULT;
893         if (unlikely(curval != uval))
894                 goto retry;
895
896         /*
897          * We took the lock due to forced take over.
898          */
899         if (unlikely(lock_taken))
900                 return 1;
901
902         /*
903          * We dont have the lock. Look up the PI state (or create it if
904          * we are the first waiter):
905          */
906         ret = lookup_pi_state(uval, hb, key, ps);
907
908         if (unlikely(ret)) {
909                 switch (ret) {
910                 case -ESRCH:
911                         /*
912                          * We failed to find an owner for this
913                          * futex. So we have no pi_state to block
914                          * on. This can happen in two cases:
915                          *
916                          * 1) The owner died
917                          * 2) A stale FUTEX_WAITERS bit
918                          *
919                          * Re-read the futex value.
920                          */
921                         if (get_futex_value_locked(&curval, uaddr))
922                                 return -EFAULT;
923
924                         /*
925                          * If the owner died or we have a stale
926                          * WAITERS bit the owner TID in the user space
927                          * futex is 0.
928                          */
929                         if (!(curval & FUTEX_TID_MASK)) {
930                                 force_take = 1;
931                                 goto retry;
932                         }
933                 default:
934                         break;
935                 }
936         }
937
938         return ret;
939 }
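/*
 * For reference, a minimal userspace sketch of the PI futex value protocol
 * that futex_lock_pi_atomic() mirrors on the kernel side.  Illustrative only:
 * futexvar and tid are placeholders, the futex() helper is an assumed
 * syscall(SYS_futex) wrapper, and C11 <stdatomic.h> is assumed for the
 * compare-and-exchange.
 *
 *      // Fast path: an uncontended lock is a 0 -> TID cmpxchg in userspace.
 *      int expected = 0;
 *      if (atomic_compare_exchange_strong(futexvar, &expected, tid))
 *              return 0;                       // got the lock, no syscall
 *      // Slow path: let the kernel queue us and boost the owner.
 *      return futex(futexvar, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 */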
940
941 /**
942  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
943  * @q:  The futex_q to unqueue
944  *
945  * The q->lock_ptr must not be NULL and must be held by the caller.
946  */
947 static void __unqueue_futex(struct futex_q *q)
948 {
949         struct futex_hash_bucket *hb;
950
951         if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
952             || WARN_ON(plist_node_empty(&q->list)))
953                 return;
954
955         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
956         plist_del(&q->list, &hb->chain);
957 }
958
959 /*
960  * The hash bucket lock must be held when this is called.
961  * Afterwards, the futex_q must not be accessed.
962  */
963 static void wake_futex(struct futex_q *q)
964 {
965         struct task_struct *p = q->task;
966
967         if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
968                 return;
969
970         /*
971          * We set q->lock_ptr = NULL _before_ we wake up the task. If
972          * a non-futex wake up happens on another CPU then the task
973          * might exit and p would dereference a non-existing task
974          * struct. Prevent this by holding a reference on p across the
975          * wake up.
976          */
977         get_task_struct(p);
978
979         __unqueue_futex(q);
980         /*
981          * The waiting task can free the futex_q as soon as
982          * q->lock_ptr = NULL is written, without taking any locks. A
983          * memory barrier is required here to prevent the following
984          * store to lock_ptr from getting ahead of the plist_del.
985          */
986         smp_wmb();
987         q->lock_ptr = NULL;
988
989         wake_up_state(p, TASK_NORMAL);
990         put_task_struct(p);
991 }
992
993 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
994 {
995         struct task_struct *new_owner;
996         struct futex_pi_state *pi_state = this->pi_state;
997         u32 uninitialized_var(curval), newval;
998
999         if (!pi_state)
1000                 return -EINVAL;
1001
1002         /*
1003          * If current does not own the pi_state then the futex is
1004          * inconsistent and user space fiddled with the futex value.
1005          */
1006         if (pi_state->owner != current)
1007                 return -EINVAL;
1008
1009         raw_spin_lock(&pi_state->pi_mutex.wait_lock);
1010         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1011
1012         /*
1013          * It is possible that the next waiter (the one that brought
1014          * this owner to the kernel) timed out and is no longer
1015          * waiting on the lock.
1016          */
1017         if (!new_owner)
1018                 new_owner = this->task;
1019
1020         /*
1021          * We pass it to the next owner. (The WAITERS bit is always
1022          * kept enabled while there is PI state around. We must also
1023          * preserve the owner died bit.)
1024          */
1025         if (!(uval & FUTEX_OWNER_DIED)) {
1026                 int ret = 0;
1027
1028                 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1029
1030                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1031                         ret = -EFAULT;
1032                 else if (curval != uval)
1033                         ret = -EINVAL;
1034                 if (ret) {
1035                         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1036                         return ret;
1037                 }
1038         }
1039
1040         raw_spin_lock_irq(&pi_state->owner->pi_lock);
1041         WARN_ON(list_empty(&pi_state->list));
1042         list_del_init(&pi_state->list);
1043         raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1044
1045         raw_spin_lock_irq(&new_owner->pi_lock);
1046         WARN_ON(!list_empty(&pi_state->list));
1047         list_add(&pi_state->list, &new_owner->pi_state_list);
1048         pi_state->owner = new_owner;
1049         raw_spin_unlock_irq(&new_owner->pi_lock);
1050
1051         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1052         rt_mutex_unlock(&pi_state->pi_mutex);
1053
1054         return 0;
1055 }
1056
1057 static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1058 {
1059         u32 uninitialized_var(oldval);
1060
1061         /*
1062          * There is no waiter, so we unlock the futex. The owner died
1063          * bit does not need to be preserved here. We are the owner:
1064          */
1065         if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1066                 return -EFAULT;
1067         if (oldval != uval)
1068                 return -EAGAIN;
1069
1070         return 0;
1071 }
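/*
 * The matching userspace unlock, as a hedged sketch: an uncontended unlock is
 * a TID -> 0 cmpxchg entirely in userspace; only when FUTEX_WAITERS is set
 * does the owner enter the kernel and end up in wake_futex_pi() or
 * unlock_futex_pi() above (futexvar/tid and the futex() wrapper are assumed,
 * as in the sketch after futex_lock_pi_atomic()).
 *
 *      int expected = tid;
 *      if (atomic_compare_exchange_strong(futexvar, &expected, 0))
 *              return 0;                       // nobody was waiting
 *      return futex(futexvar, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */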
1072
1073 /*
1074  * Express the locking dependencies for lockdep:
1075  */
1076 static inline void
1077 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1078 {
1079         if (hb1 <= hb2) {
1080                 spin_lock(&hb1->lock);
1081                 if (hb1 < hb2)
1082                         spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1083         } else { /* hb1 > hb2 */
1084                 spin_lock(&hb2->lock);
1085                 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1086         }
1087 }
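/*
 * Taking the two bucket locks in ascending address order also means that two
 * concurrent callers working on the same pair of buckets in opposite order
 * cannot ABBA deadlock, e.g.:
 *
 *      CPU 0: double_lock_hb(hbA, hbB);        // locks hbA, then hbB
 *      CPU 1: double_lock_hb(hbB, hbA);        // also locks hbA first
 */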
1088
1089 static inline void
1090 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1091 {
1092         spin_unlock(&hb1->lock);
1093         if (hb1 != hb2)
1094                 spin_unlock(&hb2->lock);
1095 }
1096
1097 /*
1098  * Wake up waiters matching bitset queued on this futex (uaddr).
1099  */
1100 static int
1101 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1102 {
1103         struct futex_hash_bucket *hb;
1104         struct futex_q *this, *next;
1105         union futex_key key = FUTEX_KEY_INIT;
1106         int ret;
1107
1108         if (!bitset)
1109                 return -EINVAL;
1110
1111         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1112         if (unlikely(ret != 0))
1113                 goto out;
1114
1115         hb = hash_futex(&key);
1116
1117         /* Make sure we really have tasks to wake up */
1118         if (!hb_waiters_pending(hb))
1119                 goto out_put_key;
1120
1121         spin_lock(&hb->lock);
1122
1123         plist_for_each_entry_safe(this, next, &hb->chain, list) {
1124                 if (match_futex (&this->key, &key)) {
1125                         if (this->pi_state || this->rt_waiter) {
1126                                 ret = -EINVAL;
1127                                 break;
1128                         }
1129
1130                         /* Check if one of the bits is set in both bitsets */
1131                         if (!(this->bitset & bitset))
1132                                 continue;
1133
1134                         wake_futex(this);
1135                         if (++ret >= nr_wake)
1136                                 break;
1137                 }
1138         }
1139
1140         spin_unlock(&hb->lock);
1141 out_put_key:
1142         put_futex_key(&key);
1143 out:
1144         return ret;
1145 }
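/*
 * An illustrative userspace use of the bitset handling above (a minimal
 * sketch; the futex() helper is an assumed syscall(SYS_futex) wrapper): a
 * waiter tags itself with a bitset and a waker can target only matching
 * waiters, while plain FUTEX_WAKE passes FUTEX_BITSET_MATCH_ANY and so
 * matches all of them.
 *
 *      futex(futexvar, FUTEX_WAIT_BITSET, val, NULL, NULL, 0x1);  // waiter A
 *      futex(futexvar, FUTEX_WAIT_BITSET, val, NULL, NULL, 0x2);  // waiter B
 *
 *      futex(futexvar, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x2);    // wakes B only
 */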
1146
1147 /*
1148  * Wake up all waiters hashed on the physical page that is mapped
1149  * to this virtual address:
1150  */
1151 static int
1152 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1153               int nr_wake, int nr_wake2, int op)
1154 {
1155         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156         struct futex_hash_bucket *hb1, *hb2;
1157         struct futex_q *this, *next;
1158         int ret, op_ret;
1159
1160 retry:
1161         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1162         if (unlikely(ret != 0))
1163                 goto out;
1164         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1165         if (unlikely(ret != 0))
1166                 goto out_put_key1;
1167
1168         hb1 = hash_futex(&key1);
1169         hb2 = hash_futex(&key2);
1170
1171 retry_private:
1172         double_lock_hb(hb1, hb2);
1173         op_ret = futex_atomic_op_inuser(op, uaddr2);
1174         if (unlikely(op_ret < 0)) {
1175
1176                 double_unlock_hb(hb1, hb2);
1177
1178 #ifndef CONFIG_MMU
1179                 /*
1180                  * we don't get EFAULT from MMU faults if we don't have an MMU,
1181                  * but we might get them from range checking
1182                  */
1183                 ret = op_ret;
1184                 goto out_put_keys;
1185 #endif
1186
1187                 if (unlikely(op_ret != -EFAULT)) {
1188                         ret = op_ret;
1189                         goto out_put_keys;
1190                 }
1191
1192                 ret = fault_in_user_writeable(uaddr2);
1193                 if (ret)
1194                         goto out_put_keys;
1195
1196                 if (!(flags & FLAGS_SHARED))
1197                         goto retry_private;
1198
1199                 put_futex_key(&key2);
1200                 put_futex_key(&key1);
1201                 goto retry;
1202         }
1203
1204         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1205                 if (match_futex (&this->key, &key1)) {
1206                         if (this->pi_state || this->rt_waiter) {
1207                                 ret = -EINVAL;
1208                                 goto out_unlock;
1209                         }
1210                         wake_futex(this);
1211                         if (++ret >= nr_wake)
1212                                 break;
1213                 }
1214         }
1215
1216         if (op_ret > 0) {
1217                 op_ret = 0;
1218                 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1219                         if (match_futex (&this->key, &key2)) {
1220                                 if (this->pi_state || this->rt_waiter) {
1221                                         ret = -EINVAL;
1222                                         goto out_unlock;
1223                                 }
1224                                 wake_futex(this);
1225                                 if (++op_ret >= nr_wake2)
1226                                         break;
1227                         }
1228                 }
1229                 ret += op_ret;
1230         }
1231
1232 out_unlock:
1233         double_unlock_hb(hb1, hb2);
1234 out_put_keys:
1235         put_futex_key(&key2);
1236 out_put_key1:
1237         put_futex_key(&key1);
1238 out:
1239         return ret;
1240 }
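/*
 * The semantics of the op argument handled above, in pseudo-code (this is
 * what futex_atomic_op_inuser() performs on *uaddr2; the encoding itself is
 * built with the FUTEX_OP() macro from the uapi futex header):
 *
 *      oldval = *uaddr2;
 *      *uaddr2 = oldval <op> oparg;            // e.g. FUTEX_OP_SET, FUTEX_OP_ADD
 *      wake up to nr_wake waiters on uaddr1;
 *      if (oldval <cmp> cmparg)                // e.g. FUTEX_OP_CMP_EQ
 *              wake up to nr_wake2 waiters on uaddr2;
 *      return the total number woken;
 */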
1241
1242 /**
1243  * requeue_futex() - Requeue a futex_q from one hb to another
1244  * @q:          the futex_q to requeue
1245  * @hb1:        the source hash_bucket
1246  * @hb2:        the target hash_bucket
1247  * @key2:       the new key for the requeued futex_q
1248  */
1249 static inline
1250 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1251                    struct futex_hash_bucket *hb2, union futex_key *key2)
1252 {
1253
1254         /*
1255          * If key1 and key2 hash to the same bucket, no need to
1256          * requeue.
1257          */
1258         if (likely(&hb1->chain != &hb2->chain)) {
1259                 plist_del(&q->list, &hb1->chain);
1260                 plist_add(&q->list, &hb2->chain);
1261                 q->lock_ptr = &hb2->lock;
1262         }
1263         get_futex_key_refs(key2);
1264         q->key = *key2;
1265 }
1266
1267 /**
1268  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1269  * @q:          the futex_q
1270  * @key:        the key of the requeue target futex
1271  * @hb:         the hash_bucket of the requeue target futex
1272  *
1273  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1274  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1275  * to the requeue target futex so the waiter can detect the wakeup on the right
1276  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1277  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1278  * to protect access to the pi_state to fixup the owner later.  Must be called
1279  * with both q->lock_ptr and hb->lock held.
1280  */
1281 static inline
1282 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1283                            struct futex_hash_bucket *hb)
1284 {
1285         get_futex_key_refs(key);
1286         q->key = *key;
1287
1288         __unqueue_futex(q);
1289
1290         WARN_ON(!q->rt_waiter);
1291         q->rt_waiter = NULL;
1292
1293         q->lock_ptr = &hb->lock;
1294
1295         wake_up_state(q->task, TASK_NORMAL);
1296 }
1297
1298 /**
1299  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1300  * @pifutex:            the user address of the to futex
1301  * @hb1:                the from futex hash bucket, must be locked by the caller
1302  * @hb2:                the to futex hash bucket, must be locked by the caller
1303  * @key1:               the from futex key
1304  * @key2:               the to futex key
1305  * @ps:                 address to store the pi_state pointer
1306  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1307  *
1308  * Try and get the lock on behalf of the top waiter if we can do it atomically.
1309  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1310  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1311  * hb1 and hb2 must be held by the caller.
1312  *
1313  * Return:
1314  *  0 - failed to acquire the lock atomically;
1315  *  1 - acquired the lock;
1316  * <0 - error
1317  */
1318 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1319                                  struct futex_hash_bucket *hb1,
1320                                  struct futex_hash_bucket *hb2,
1321                                  union futex_key *key1, union futex_key *key2,
1322                                  struct futex_pi_state **ps, int set_waiters)
1323 {
1324         struct futex_q *top_waiter = NULL;
1325         u32 curval;
1326         int ret;
1327
1328         if (get_futex_value_locked(&curval, pifutex))
1329                 return -EFAULT;
1330
1331         /*
1332          * Find the top_waiter and determine if there are additional waiters.
1333          * If the caller intends to requeue more than 1 waiter to pifutex,
1334          * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1335          * as we have means to handle the possible fault.  If not, don't set
1336          * the bit unnecessarily as it will force the subsequent unlock to enter
1337          * the kernel.
1338          */
1339         top_waiter = futex_top_waiter(hb1, key1);
1340
1341         /* There are no waiters, nothing for us to do. */
1342         if (!top_waiter)
1343                 return 0;
1344
1345         /* Ensure we requeue to the expected futex. */
1346         if (!match_futex(top_waiter->requeue_pi_key, key2))
1347                 return -EINVAL;
1348
1349         /*
1350          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1351          * the contended case or if set_waiters is 1.  The pi_state is returned
1352          * in ps in contended cases.
1353          */
1354         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1355                                    set_waiters);
1356         if (ret == 1)
1357                 requeue_pi_wake_futex(top_waiter, key2, hb2);
1358
1359         return ret;
1360 }
1361
1362 /**
1363  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1364  * @uaddr1:     source futex user address
1365  * @flags:      futex flags (FLAGS_SHARED, etc.)
1366  * @uaddr2:     target futex user address
1367  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1368  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1369  * @cmpval:     @uaddr1 expected value (or %NULL)
1370  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1371  *              pi futex (pi to pi requeue is not supported)
1372  *
1373  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1374  * uaddr2 atomically on behalf of the top waiter.
1375  *
1376  * Return:
1377  * >=0 - on success, the number of tasks requeued or woken;
1378  *  <0 - on error
1379  */
1380 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1381                          u32 __user *uaddr2, int nr_wake, int nr_requeue,
1382                          u32 *cmpval, int requeue_pi)
1383 {
1384         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1385         int drop_count = 0, task_count = 0, ret;
1386         struct futex_pi_state *pi_state = NULL;
1387         struct futex_hash_bucket *hb1, *hb2;
1388         struct futex_q *this, *next;
1389         u32 curval2;
1390
1391         if (requeue_pi) {
1392                 /*
1393                  * requeue_pi requires a pi_state, try to allocate it now
1394                  * without any locks in case it fails.
1395                  */
1396                 if (refill_pi_state_cache())
1397                         return -ENOMEM;
1398                 /*
1399                  * requeue_pi must wake as many tasks as it can, up to nr_wake
1400                  * + nr_requeue, since it acquires the rt_mutex prior to
1401                  * returning to userspace, so as to not leave the rt_mutex with
1402                  * waiters and no owner.  However, second and third wake-ups
1403                  * cannot be predicted as they involve race conditions with the
1404                  * first wake and a fault while looking up the pi_state.  Both
1405                  * pthread_cond_signal() and pthread_cond_broadcast() should
1406                  * use nr_wake=1.
1407                  */
1408                 if (nr_wake != 1)
1409                         return -EINVAL;
1410         }
1411
1412 retry:
1413         if (pi_state != NULL) {
1414                 /*
1415                  * We will have to lookup the pi_state again, so free this one
1416                  * to keep the accounting correct.
1417                  */
1418                 free_pi_state(pi_state);
1419                 pi_state = NULL;
1420         }
1421
1422         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1423         if (unlikely(ret != 0))
1424                 goto out;
1425         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1426                             requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1427         if (unlikely(ret != 0))
1428                 goto out_put_key1;
1429
1430         hb1 = hash_futex(&key1);
1431         hb2 = hash_futex(&key2);
1432
1433 retry_private:
1434         double_lock_hb(hb1, hb2);
1435
1436         if (likely(cmpval != NULL)) {
1437                 u32 curval;
1438
1439                 ret = get_futex_value_locked(&curval, uaddr1);
1440
1441                 if (unlikely(ret)) {
1442                         double_unlock_hb(hb1, hb2);
1443
1444                         ret = get_user(curval, uaddr1);
1445                         if (ret)
1446                                 goto out_put_keys;
1447
1448                         if (!(flags & FLAGS_SHARED))
1449                                 goto retry_private;
1450
1451                         put_futex_key(&key2);
1452                         put_futex_key(&key1);
1453                         goto retry;
1454                 }
1455                 if (curval != *cmpval) {
1456                         ret = -EAGAIN;
1457                         goto out_unlock;
1458                 }
1459         }
1460
1461         if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1462                 /*
1463                  * Attempt to acquire uaddr2 and wake the top waiter. If we
1464                  * intend to requeue waiters, force setting the FUTEX_WAITERS
1465                  * bit.  We force this here where we are able to easily handle
1466                  * faults rather than in the requeue loop below.
1467                  */
1468                 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1469                                                  &key2, &pi_state, nr_requeue);
1470
1471                 /*
1472                  * At this point the top_waiter has either taken uaddr2 or is
1473                  * waiting on it.  If the former, then the pi_state will not
1474                  * exist yet, look it up one more time to ensure we have a
1475                  * reference to it.
1476                  */
1477                 if (ret == 1) {
1478                         WARN_ON(pi_state);
1479                         drop_count++;
1480                         task_count++;
1481                         ret = get_futex_value_locked(&curval2, uaddr2);
1482                         if (!ret)
1483                                 ret = lookup_pi_state(curval2, hb2, &key2,
1484                                                       &pi_state);
1485                 }
1486
1487                 switch (ret) {
1488                 case 0:
1489                         break;
1490                 case -EFAULT:
1491                         double_unlock_hb(hb1, hb2);
1492                         put_futex_key(&key2);
1493                         put_futex_key(&key1);
1494                         ret = fault_in_user_writeable(uaddr2);
1495                         if (!ret)
1496                                 goto retry;
1497                         goto out;
1498                 case -EAGAIN:
1499                         /* The owner was exiting, try again. */
1500                         double_unlock_hb(hb1, hb2);
1501                         put_futex_key(&key2);
1502                         put_futex_key(&key1);
1503                         cond_resched();
1504                         goto retry;
1505                 default:
1506                         goto out_unlock;
1507                 }
1508         }
1509
1510         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1511                 if (task_count - nr_wake >= nr_requeue)
1512                         break;
1513
1514                 if (!match_futex(&this->key, &key1))
1515                         continue;
1516
1517                 /*
1518                  * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1519                  * be paired with each other and no other futex ops.
1520                  *
1521                  * We should never be requeueing a futex_q with a pi_state,
1522                  * which is awaiting a futex_unlock_pi().
1523                  */
1524                 if ((requeue_pi && !this->rt_waiter) ||
1525                     (!requeue_pi && this->rt_waiter) ||
1526                     this->pi_state) {
1527                         ret = -EINVAL;
1528                         break;
1529                 }
1530
1531                 /*
1532                  * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1533                  * lock, we already woke the top_waiter.  If not, it will be
1534                  * woken by futex_unlock_pi().
1535                  */
1536                 if (++task_count <= nr_wake && !requeue_pi) {
1537                         wake_futex(this);
1538                         continue;
1539                 }
1540
1541                 /* Ensure we requeue to the expected futex for requeue_pi. */
1542                 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1543                         ret = -EINVAL;
1544                         break;
1545                 }
1546
1547                 /*
1548                  * Requeue nr_requeue waiters and possibly one more in the case
1549                  * of requeue_pi if we couldn't acquire the lock atomically.
1550                  */
1551                 if (requeue_pi) {
1552                         /* Prepare the waiter to take the rt_mutex. */
1553                         atomic_inc(&pi_state->refcount);
1554                         this->pi_state = pi_state;
1555                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1556                                                         this->rt_waiter,
1557                                                         this->task, 1);
1558                         if (ret == 1) {
1559                                 /* We got the lock. */
1560                                 requeue_pi_wake_futex(this, &key2, hb2);
1561                                 drop_count++;
1562                                 continue;
1563                         } else if (ret) {
1564                                 /* -EDEADLK */
1565                                 this->pi_state = NULL;
1566                                 free_pi_state(pi_state);
1567                                 goto out_unlock;
1568                         }
1569                 }
1570                 requeue_futex(this, hb1, hb2, &key2);
1571                 drop_count++;
1572         }
1573
1574 out_unlock:
1575         double_unlock_hb(hb1, hb2);
1576
1577         /*
1578          * drop_futex_key_refs() must be called outside the spinlocks. During
1579          * the requeue we moved futex_q's from the hash bucket at key1 to the
1580          * one at key2 and updated their key pointer.  We no longer need to
1581          * hold the references to key1.
1582          */
1583         while (--drop_count >= 0)
1584                 drop_futex_key_refs(&key1);
1585
1586 out_put_keys:
1587         put_futex_key(&key2);
1588 out_put_key1:
1589         put_futex_key(&key1);
1590 out:
1591         if (pi_state != NULL)
1592                 free_pi_state(pi_state);
1593         return ret ? ret : task_count;
1594 }
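/*
 * Editor's note: illustrative sketch only, not part of this file.  A
 * userspace caller of the plain (non-PI) requeue path above, in the
 * style glibc condition variables use for broadcast: wake one waiter
 * on the condition word and requeue the rest onto the mutex word so
 * they do not all stampede back at once.  The futex() wrapper and the
 * names cond/mutex/snapshot are assumptions for the sketch; error
 * handling (e.g. EAGAIN when *cond no longer equals snapshot) is
 * omitted.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <limits.h>
 *	#include <stdint.h>
 *
 *	static long futex(uint32_t *uaddr, int op, uint32_t val,
 *			  uint32_t val2, uint32_t *uaddr2, uint32_t val3)
 *	{
 *		// val2 rides in the timeout slot for the requeue commands,
 *		// exactly as sys_futex() below unpacks it.
 *		return syscall(SYS_futex, uaddr, op, val,
 *			       (unsigned long)val2, uaddr2, val3);
 *	}
 *
 *	void broadcast(uint32_t *cond, uint32_t snapshot, uint32_t *mutex)
 *	{
 *		// Wake one waiter, requeue up to INT_MAX others onto *mutex,
 *		// but only if *cond still equals snapshot (the cmpval check
 *		// in futex_requeue() above).
 *		futex(cond, FUTEX_CMP_REQUEUE_PRIVATE, 1, INT_MAX,
 *		      mutex, snapshot);
 *	}
 */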
1595
1596 /* The key must already be stored in q->key. */
1597 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1598         __acquires(&hb->lock)
1599 {
1600         struct futex_hash_bucket *hb;
1601
1602         hb = hash_futex(&q->key);
1603         q->lock_ptr = &hb->lock;
1604
1605         spin_lock(&hb->lock); /* implies MB (A) */
1606         return hb;
1607 }
1608
1609 static inline void
1610 queue_unlock(struct futex_hash_bucket *hb)
1611         __releases(&hb->lock)
1612 {
1613         spin_unlock(&hb->lock);
1614 }
1615
1616 /**
1617  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1618  * @q:  The futex_q to enqueue
1619  * @hb: The destination hash bucket
1620  *
1621  * The hb->lock must be held by the caller, and is released here. A call to
1622  * queue_me() is typically paired with exactly one call to unqueue_me().  The
1623  * exceptions involve the PI related operations, which may use unqueue_me_pi()
1624  * or nothing if the unqueue is done as part of the wake process and the unqueue
1625  * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1626  * an example).
1627  */
1628 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1629         __releases(&hb->lock)
1630 {
1631         int prio;
1632
1633         /*
1634          * The priority used to register this element is
1635          * - either the real thread-priority for the real-time threads
1636          * (i.e. threads with a priority lower than MAX_RT_PRIO)
1637          * - or MAX_RT_PRIO for non-RT threads.
1638          * Thus, all RT-threads are woken first in priority order, and
1639          * the others are woken last, in FIFO order.
1640          */
1641         prio = min(current->normal_prio, MAX_RT_PRIO);
1642
1643         plist_node_init(&q->list, prio);
1644         plist_add(&q->list, &hb->chain);
1645         q->task = current;
1646         spin_unlock(&hb->lock);
1647 }
1648
1649 /**
1650  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1651  * @q:  The futex_q to unqueue
1652  *
1653  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1654  * be paired with exactly one earlier call to queue_me().
1655  *
1656  * Return:
1657  *   1 - if the futex_q was still queued (and we removed it);
1658  *   0 - if the futex_q was already removed by the waking thread
1659  */
1660 static int unqueue_me(struct futex_q *q)
1661 {
1662         spinlock_t *lock_ptr;
1663         int ret = 0;
1664
1665         /* In the common case we don't take the spinlock, which is nice. */
1666 retry:
1667         lock_ptr = q->lock_ptr;
1668         barrier();
1669         if (lock_ptr != NULL) {
1670                 spin_lock(lock_ptr);
1671                 /*
1672                  * q->lock_ptr can change between reading it and
1673                  * spin_lock(), causing us to take the wrong lock.  This
1674                  * corrects the race condition.
1675                  *
1676                  * Reasoning goes like this: if we have the wrong lock,
1677                  * q->lock_ptr must have changed (maybe several times)
1678                  * between reading it and the spin_lock().  It can
1679                  * change again after the spin_lock() but only if it was
1680                  * already changed before the spin_lock().  It cannot,
1681                  * however, change back to the original value.  Therefore
1682                  * we can detect whether we acquired the correct lock.
1683                  */
1684                 if (unlikely(lock_ptr != q->lock_ptr)) {
1685                         spin_unlock(lock_ptr);
1686                         goto retry;
1687                 }
1688                 __unqueue_futex(q);
1689
1690                 BUG_ON(q->pi_state);
1691
1692                 spin_unlock(lock_ptr);
1693                 ret = 1;
1694         }
1695
1696         drop_futex_key_refs(&q->key);
1697         return ret;
1698 }
1699
1700 /*
1701  * PI futexes cannot be requeued and must remove themselves from the
1702  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1703  * and dropped here.
1704  */
1705 static void unqueue_me_pi(struct futex_q *q)
1706         __releases(q->lock_ptr)
1707 {
1708         __unqueue_futex(q);
1709
1710         BUG_ON(!q->pi_state);
1711         free_pi_state(q->pi_state);
1712         q->pi_state = NULL;
1713
1714         spin_unlock(q->lock_ptr);
1715 }
1716
1717 /*
1718  * Fixup the pi_state owner with the new owner.
1719  *
1720  * Must be called with hash bucket lock held and mm->sem held for non
1721  * private futexes.
1722  */
1723 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1724                                 struct task_struct *newowner)
1725 {
1726         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1727         struct futex_pi_state *pi_state = q->pi_state;
1728         struct task_struct *oldowner = pi_state->owner;
1729         u32 uval, uninitialized_var(curval), newval;
1730         int ret;
1731
1732         /* Owner died? */
1733         if (!pi_state->owner)
1734                 newtid |= FUTEX_OWNER_DIED;
1735
1736         /*
1737          * We are here either because we stole the rtmutex from the
1738          * previous highest priority waiter or we are the highest priority
1739          * waiter but failed to get the rtmutex the first time.
1740          * We have to replace the newowner TID in the user space variable.
1741          * This must be atomic as we have to preserve the owner died bit here.
1742          *
1743          * Note: We write the user space value _before_ changing the pi_state
1744          * because we can fault here. Imagine swapped out pages or a fork
1745          * that marked all the anonymous memory readonly for cow.
1746          *
1747          * Modifying pi_state _before_ the user space value would
1748          * leave the pi_state in an inconsistent state when we fault
1749          * here, because we need to drop the hash bucket lock to
1750          * handle the fault. This might be observed in the PID check
1751          * in lookup_pi_state.
1752          */
1753 retry:
1754         if (get_futex_value_locked(&uval, uaddr))
1755                 goto handle_fault;
1756
1757         while (1) {
1758                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1759
1760                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1761                         goto handle_fault;
1762                 if (curval == uval)
1763                         break;
1764                 uval = curval;
1765         }
1766
1767         /*
1768          * We fixed up user space. Now we need to fix the pi_state
1769          * itself.
1770          */
1771         if (pi_state->owner != NULL) {
1772                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1773                 WARN_ON(list_empty(&pi_state->list));
1774                 list_del_init(&pi_state->list);
1775                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1776         }
1777
1778         pi_state->owner = newowner;
1779
1780         raw_spin_lock_irq(&newowner->pi_lock);
1781         WARN_ON(!list_empty(&pi_state->list));
1782         list_add(&pi_state->list, &newowner->pi_state_list);
1783         raw_spin_unlock_irq(&newowner->pi_lock);
1784         return 0;
1785
1786         /*
1787          * To handle the page fault we need to drop the hash bucket
1788          * lock here. That gives the other task (either the highest priority
1789          * waiter itself or the task which stole the rtmutex) the
1790          * chance to try the fixup of the pi_state. So once we are
1791          * back from handling the fault we need to check the pi_state
1792          * after reacquiring the hash bucket lock and before trying to
1793          * do another fixup. When the fixup has been done already we
1794          * simply return.
1795          */
1796 handle_fault:
1797         spin_unlock(q->lock_ptr);
1798
1799         ret = fault_in_user_writeable(uaddr);
1800
1801         spin_lock(q->lock_ptr);
1802
1803         /*
1804          * Check if someone else fixed it for us:
1805          */
1806         if (pi_state->owner != oldowner)
1807                 return 0;
1808
1809         if (ret)
1810                 return ret;
1811
1812         goto retry;
1813 }
1814
1815 static long futex_wait_restart(struct restart_block *restart);
1816
1817 /**
1818  * fixup_owner() - Post lock pi_state and corner case management
1819  * @uaddr:      user address of the futex
1820  * @q:          futex_q (contains pi_state and access to the rt_mutex)
1821  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
1822  *
1823  * After attempting to lock an rt_mutex, this function is called to cleanup
1824  * the pi_state owner as well as handle race conditions that may allow us to
1825  * acquire the lock. Must be called with the hb lock held.
1826  *
1827  * Return:
1828  *  1 - success, lock taken;
1829  *  0 - success, lock not taken;
1830  * <0 - on error (-EFAULT)
1831  */
1832 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1833 {
1834         struct task_struct *owner;
1835         int ret = 0;
1836
1837         if (locked) {
1838                 /*
1839                  * Got the lock. We might not be the anticipated owner if we
1840                  * did a lock-steal - fix up the PI-state in that case:
1841                  */
1842                 if (q->pi_state->owner != current)
1843                         ret = fixup_pi_state_owner(uaddr, q, current);
1844                 goto out;
1845         }
1846
1847         /*
1848          * Catch the rare case, where the lock was released when we were on the
1849          * way back before we locked the hash bucket.
1850          */
1851         if (q->pi_state->owner == current) {
1852                 /*
1853                  * Try to get the rt_mutex now. This might fail as some other
1854                  * task acquired the rt_mutex after we removed ourself from the
1855                  * rt_mutex waiters list.
1856                  */
1857                 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1858                         locked = 1;
1859                         goto out;
1860                 }
1861
1862                 /*
1863                  * pi_state is incorrect, some other task did a lock steal and
1864                  * we returned due to timeout or signal without taking the
1865                  * rt_mutex. Too late.
1866                  */
1867                 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1868                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1869                 if (!owner)
1870                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1871                 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1872                 ret = fixup_pi_state_owner(uaddr, q, owner);
1873                 goto out;
1874         }
1875
1876         /*
1877          * Paranoia check. If we did not take the lock, then we should not be
1878          * the owner of the rt_mutex.
1879          */
1880         if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1881                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1882                                 "pi-state %p\n", ret,
1883                                 q->pi_state->pi_mutex.owner,
1884                                 q->pi_state->owner);
1885
1886 out:
1887         return ret ? ret : locked;
1888 }
1889
1890 /**
1891  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1892  * @hb:         the futex hash bucket, must be locked by the caller
1893  * @q:          the futex_q to queue up on
1894  * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
1895  */
1896 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1897                                 struct hrtimer_sleeper *timeout)
1898 {
1899         /*
1900          * The task state is guaranteed to be set before another task can
1901          * wake it. set_current_state() is implemented using set_mb() and
1902          * queue_me() calls spin_unlock() upon completion, both serializing
1903          * access to the hash list and forcing another memory barrier.
1904          */
1905         set_current_state(TASK_INTERRUPTIBLE);
1906         queue_me(q, hb);
1907
1908         /* Arm the timer */
1909         if (timeout) {
1910                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1911                 if (!hrtimer_active(&timeout->timer))
1912                         timeout->task = NULL;
1913         }
1914
1915         /*
1916          * If we have been removed from the hash list, then another task
1917          * has tried to wake us, and we can skip the call to schedule().
1918          */
1919         if (likely(!plist_node_empty(&q->list))) {
1920                 /*
1921                  * If the timer has already expired, current will already be
1922                  * flagged for rescheduling. Only call schedule if there
1923                  * is no timeout, or if it has yet to expire.
1924                  */
1925                 if (!timeout || timeout->task)
1926                         freezable_schedule();
1927         }
1928         __set_current_state(TASK_RUNNING);
1929 }
1930
1931 /**
1932  * futex_wait_setup() - Prepare to wait on a futex
1933  * @uaddr:      the futex userspace address
1934  * @val:        the expected value
1935  * @flags:      futex flags (FLAGS_SHARED, etc.)
1936  * @q:          the associated futex_q
1937  * @hb:         storage for hash_bucket pointer to be returned to caller
1938  *
1939  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
1940  * compare it with the expected value.  Handle atomic faults internally.
1941  * Return with the hb lock held and a q.key reference on success, and unlocked
1942  * with no q.key reference on failure.
1943  *
1944  * Return:
1945  *  0 - uaddr contains val and hb has been locked;
1946  * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1947  */
1948 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1949                            struct futex_q *q, struct futex_hash_bucket **hb)
1950 {
1951         u32 uval;
1952         int ret;
1953
1954         /*
1955          * Access the page AFTER the hash-bucket is locked.
1956          * Order is important:
1957          *
1958          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1959          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1960          *
1961          * The basic logical guarantee of a futex is that it blocks ONLY
1962          * if cond(var) is known to be true at the time of blocking, for
1963          * any cond.  If we locked the hash-bucket after testing *uaddr, that
1964          * would open a race condition where we could block indefinitely with
1965          * cond(var) false, which would violate the guarantee.
1966          *
1967          * On the other hand, we insert q and release the hash-bucket only
1968          * after testing *uaddr.  This guarantees that futex_wait() will NOT
1969          * absorb a wakeup if *uaddr does not match the desired value
1970          * while the syscall executes.
1971          */
1972 retry:
1973         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1974         if (unlikely(ret != 0))
1975                 return ret;
1976
1977 retry_private:
1978         *hb = queue_lock(q);
1979
1980         ret = get_futex_value_locked(&uval, uaddr);
1981
1982         if (ret) {
1983                 queue_unlock(*hb);
1984
1985                 ret = get_user(uval, uaddr);
1986                 if (ret)
1987                         goto out;
1988
1989                 if (!(flags & FLAGS_SHARED))
1990                         goto retry_private;
1991
1992                 put_futex_key(&q->key);
1993                 goto retry;
1994         }
1995
1996         if (uval != val) {
1997                 queue_unlock(*hb);
1998                 ret = -EWOULDBLOCK;
1999         }
2000
2001 out:
2002         if (ret)
2003                 put_futex_key(&q->key);
2004         return ret;
2005 }
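/*
 * Editor's note: illustrative sketch only, not part of this file.  The
 * waiter/waker ordering argument above in concrete userspace form: the
 * waiter passes the exact value it observed, and FUTEX_WAIT returns
 * immediately (EWOULDBLOCK) if the word has changed by the time the
 * kernel re-reads it under the bucket lock.  "flag" and the helper
 * names are assumptions for the sketch.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	void wait_for_nonzero(uint32_t *flag)
 *	{
 *		uint32_t val;
 *
 *		while ((val = __atomic_load_n(flag, __ATOMIC_ACQUIRE)) == 0)
 *			syscall(SYS_futex, flag, FUTEX_WAIT_PRIVATE, val,
 *				NULL, NULL, 0);
 *	}
 *
 *	void set_and_wake(uint32_t *flag)
 *	{
 *		// Store first, wake second -- the mirror image of the
 *		// "lock bucket, then test *uaddr" order on the wait side.
 *		__atomic_store_n(flag, 1, __ATOMIC_RELEASE);
 *		syscall(SYS_futex, flag, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
 *	}
 */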
2006
2007 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2008                       ktime_t *abs_time, u32 bitset)
2009 {
2010         struct hrtimer_sleeper timeout, *to = NULL;
2011         struct restart_block *restart;
2012         struct futex_hash_bucket *hb;
2013         struct futex_q q = futex_q_init;
2014         int ret;
2015
2016         if (!bitset)
2017                 return -EINVAL;
2018         q.bitset = bitset;
2019
2020         if (abs_time) {
2021                 to = &timeout;
2022
2023                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2024                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2025                                       HRTIMER_MODE_ABS);
2026                 hrtimer_init_sleeper(to, current);
2027                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2028                                              current->timer_slack_ns);
2029         }
2030
2031 retry:
2032         /*
2033          * Prepare to wait on uaddr. On success, holds hb lock and increments
2034          * q.key refs.
2035          */
2036         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2037         if (ret)
2038                 goto out;
2039
2040         /* queue_me and wait for wakeup, timeout, or a signal. */
2041         futex_wait_queue_me(hb, &q, to);
2042
2043         /* If we were woken (and unqueued), we succeeded, whatever. */
2044         ret = 0;
2045         /* unqueue_me() drops q.key ref */
2046         if (!unqueue_me(&q))
2047                 goto out;
2048         ret = -ETIMEDOUT;
2049         if (to && !to->task)
2050                 goto out;
2051
2052         /*
2053          * We expect signal_pending(current), but we might be the
2054          * victim of a spurious wakeup as well.
2055          */
2056         if (!signal_pending(current))
2057                 goto retry;
2058
2059         ret = -ERESTARTSYS;
2060         if (!abs_time)
2061                 goto out;
2062
2063         restart = &current_thread_info()->restart_block;
2064         restart->fn = futex_wait_restart;
2065         restart->futex.uaddr = uaddr;
2066         restart->futex.val = val;
2067         restart->futex.time = abs_time->tv64;
2068         restart->futex.bitset = bitset;
2069         restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2070
2071         ret = -ERESTART_RESTARTBLOCK;
2072
2073 out:
2074         if (to) {
2075                 hrtimer_cancel(&to->timer);
2076                 destroy_hrtimer_on_stack(&to->timer);
2077         }
2078         return ret;
2079 }
2080
2081
2082 static long futex_wait_restart(struct restart_block *restart)
2083 {
2084         u32 __user *uaddr = restart->futex.uaddr;
2085         ktime_t t, *tp = NULL;
2086
2087         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2088                 t.tv64 = restart->futex.time;
2089                 tp = &t;
2090         }
2091         restart->fn = do_no_restart_syscall;
2092
2093         return (long)futex_wait(uaddr, restart->futex.flags,
2094                                 restart->futex.val, tp, restart->futex.bitset);
2095 }
2096
2097
2098 /*
2099  * Userspace tried a 0 -> TID atomic transition of the futex value
2100  * and failed. The kernel side here does the whole locking operation:
2101  * if there are waiters then it will block, it does PI, etc. (Due to
2102  * if there are waiters then it will block and handle PI, etc. (Due to
2103  */
2104 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
2105                          ktime_t *time, int trylock)
2106 {
2107         struct hrtimer_sleeper timeout, *to = NULL;
2108         struct futex_hash_bucket *hb;
2109         struct futex_q q = futex_q_init;
2110         int res, ret;
2111
2112         if (refill_pi_state_cache())
2113                 return -ENOMEM;
2114
2115         if (time) {
2116                 to = &timeout;
2117                 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2118                                       HRTIMER_MODE_ABS);
2119                 hrtimer_init_sleeper(to, current);
2120                 hrtimer_set_expires(&to->timer, *time);
2121         }
2122
2123 retry:
2124         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2125         if (unlikely(ret != 0))
2126                 goto out;
2127
2128 retry_private:
2129         hb = queue_lock(&q);
2130
2131         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2132         if (unlikely(ret)) {
2133                 switch (ret) {
2134                 case 1:
2135                         /* We got the lock. */
2136                         ret = 0;
2137                         goto out_unlock_put_key;
2138                 case -EFAULT:
2139                         goto uaddr_faulted;
2140                 case -EAGAIN:
2141                         /*
2142                          * Task is exiting and we just wait for the
2143                          * exit to complete.
2144                          */
2145                         queue_unlock(hb);
2146                         put_futex_key(&q.key);
2147                         cond_resched();
2148                         goto retry;
2149                 default:
2150                         goto out_unlock_put_key;
2151                 }
2152         }
2153
2154         /*
2155          * Only actually queue now that the atomic ops are done:
2156          */
2157         queue_me(&q, hb);
2158
2159         WARN_ON(!q.pi_state);
2160         /*
2161          * Block on the PI mutex:
2162          */
2163         if (!trylock)
2164                 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2165         else {
2166                 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2167                 /* Fixup the trylock return value: */
2168                 ret = ret ? 0 : -EWOULDBLOCK;
2169         }
2170
2171         spin_lock(q.lock_ptr);
2172         /*
2173          * Fixup the pi_state owner and possibly acquire the lock if we
2174          * haven't already.
2175          */
2176         res = fixup_owner(uaddr, &q, !ret);
2177         /*
2178          * If fixup_owner() returned an error, propagate that.  If it acquired
2179          * the lock, clear our -ETIMEDOUT or -EINTR.
2180          */
2181         if (res)
2182                 ret = (res < 0) ? res : 0;
2183
2184         /*
2185          * If fixup_owner() faulted and was unable to handle the fault, unlock
2186          * it and return the fault to userspace.
2187          */
2188         if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2189                 rt_mutex_unlock(&q.pi_state->pi_mutex);
2190
2191         /* Unqueue and drop the lock */
2192         unqueue_me_pi(&q);
2193
2194         goto out_put_key;
2195
2196 out_unlock_put_key:
2197         queue_unlock(hb);
2198
2199 out_put_key:
2200         put_futex_key(&q.key);
2201 out:
2202         if (to)
2203                 destroy_hrtimer_on_stack(&to->timer);
2204         return ret != -EINTR ? ret : -ERESTARTNOINTR;
2205
2206 uaddr_faulted:
2207         queue_unlock(hb);
2208
2209         ret = fault_in_user_writeable(uaddr);
2210         if (ret)
2211                 goto out_put_key;
2212
2213         if (!(flags & FLAGS_SHARED))
2214                 goto retry_private;
2215
2216         put_futex_key(&q.key);
2217         goto retry;
2218 }
2219
2220 /*
2221  * Userspace attempted a TID -> 0 atomic transition, and failed.
2222  * This is the in-kernel slowpath: we look up the PI state (if any),
2223  * and do the rt-mutex unlock.
2224  */
2225 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2226 {
2227         struct futex_hash_bucket *hb;
2228         struct futex_q *this, *next;
2229         union futex_key key = FUTEX_KEY_INIT;
2230         u32 uval, vpid = task_pid_vnr(current);
2231         int ret;
2232
2233 retry:
2234         if (get_user(uval, uaddr))
2235                 return -EFAULT;
2236         /*
2237          * We release only a lock we actually own:
2238          */
2239         if ((uval & FUTEX_TID_MASK) != vpid)
2240                 return -EPERM;
2241
2242         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2243         if (unlikely(ret != 0))
2244                 goto out;
2245
2246         hb = hash_futex(&key);
2247         spin_lock(&hb->lock);
2248
2249         /*
2250          * To avoid races, try to do the TID -> 0 atomic transition
2251          * again. If it succeeds then we can return without waking
2252          * anyone else up:
2253          */
2254         if (!(uval & FUTEX_OWNER_DIED) &&
2255             cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2256                 goto pi_faulted;
2257         /*
2258          * Rare case: we managed to release the lock atomically,
2259          * no need to wake anyone else up:
2260          */
2261         if (unlikely(uval == vpid))
2262                 goto out_unlock;
2263
2264         /*
2265          * Ok, other tasks may need to be woken up - check waiters
2266          * and do the wakeup if necessary:
2267          */
2268         plist_for_each_entry_safe(this, next, &hb->chain, list) {
2269                 if (!match_futex (&this->key, &key))
2270                         continue;
2271                 ret = wake_futex_pi(uaddr, uval, this);
2272                 /*
2273                  * The atomic access to the futex value
2274                  * generated a pagefault, so retry the
2275                  * user-access and the wakeup:
2276                  */
2277                 if (ret == -EFAULT)
2278                         goto pi_faulted;
2279                 goto out_unlock;
2280         }
2281         /*
2282          * No waiters - kernel unlocks the futex:
2283          */
2284         if (!(uval & FUTEX_OWNER_DIED)) {
2285                 ret = unlock_futex_pi(uaddr, uval);
2286                 if (ret == -EFAULT)
2287                         goto pi_faulted;
2288         }
2289
2290 out_unlock:
2291         spin_unlock(&hb->lock);
2292         put_futex_key(&key);
2293
2294 out:
2295         return ret;
2296
2297 pi_faulted:
2298         spin_unlock(&hb->lock);
2299         put_futex_key(&key);
2300
2301         ret = fault_in_user_writeable(uaddr);
2302         if (!ret)
2303                 goto retry;
2304
2305         return ret;
2306 }
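/*
 * Editor's note: illustrative sketch only, not part of this file.  The
 * userspace fast paths whose failure lands in futex_lock_pi() and
 * futex_unlock_pi() above: lock is an atomic 0 -> TID transition,
 * unlock is TID -> 0, and only the contended cases enter the kernel.
 * pi_lock()/pi_unlock() and the bare error handling are assumptions
 * for the sketch; tid is the caller's kernel TID (gettid()).
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *
 *	void pi_lock(uint32_t *lock, uint32_t tid)
 *	{
 *		uint32_t zero = 0;
 *
 *		if (__atomic_compare_exchange_n(lock, &zero, tid, 0,
 *						__ATOMIC_ACQUIRE,
 *						__ATOMIC_RELAXED))
 *			return;		// uncontended 0 -> TID, done
 *		// Contended: the kernel queues us on the rt_mutex, boosts
 *		// the owner, and rewrites *lock when we eventually own it.
 *		syscall(SYS_futex, lock, FUTEX_LOCK_PI_PRIVATE, 0,
 *			NULL, NULL, 0);
 *	}
 *
 *	void pi_unlock(uint32_t *lock, uint32_t tid)
 *	{
 *		uint32_t expected = tid;
 *
 *		// TID -> 0 only succeeds while no waiter bits are set.
 *		if (__atomic_compare_exchange_n(lock, &expected, 0, 0,
 *						__ATOMIC_RELEASE,
 *						__ATOMIC_RELAXED))
 *			return;
 *		syscall(SYS_futex, lock, FUTEX_UNLOCK_PI_PRIVATE, 0,
 *			NULL, NULL, 0);
 *	}
 */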
2307
2308 /**
2309  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2310  * @hb:         the hash_bucket futex_q was originally enqueued on
2311  * @q:          the futex_q woken while waiting to be requeued
2312  * @key2:       the futex_key of the requeue target futex
2313  * @timeout:    the timeout associated with the wait (NULL if none)
2314  *
2315  * Detect if the task was woken on the initial futex as opposed to the requeue
2316  * target futex.  If so, determine if it was a timeout or a signal that caused
2317  * the wakeup and return the appropriate error code to the caller.  Must be
2318  * called with the hb lock held.
2319  *
2320  * Return:
2321  *  0 = no early wakeup detected;
2322  * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2323  */
2324 static inline
2325 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2326                                    struct futex_q *q, union futex_key *key2,
2327                                    struct hrtimer_sleeper *timeout)
2328 {
2329         int ret = 0;
2330
2331         /*
2332          * With the hb lock held, we avoid races while we process the wakeup.
2333          * We only need to hold hb (and not hb2) to ensure atomicity as the
2334          * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2335          * It can't be requeued from uaddr2 to something else since we don't
2336          * support a PI aware source futex for requeue.
2337          */
2338         if (!match_futex(&q->key, key2)) {
2339                 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2340                 /*
2341                  * We were woken prior to requeue by a timeout or a signal.
2342                  * Unqueue the futex_q and determine which it was.
2343                  */
2344                 plist_del(&q->list, &hb->chain);
2345
2346                 /* Handle spurious wakeups gracefully */
2347                 ret = -EWOULDBLOCK;
2348                 if (timeout && !timeout->task)
2349                         ret = -ETIMEDOUT;
2350                 else if (signal_pending(current))
2351                         ret = -ERESTARTNOINTR;
2352         }
2353         return ret;
2354 }
2355
2356 /**
2357  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2358  * @uaddr:      the futex we initially wait on (non-pi)
2359  * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2360  *              the same type, no requeueing from private to shared, etc.
2361  * @val:        the expected value of uaddr
2362  * @abs_time:   absolute timeout
2363  * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2364  * @uaddr2:     the pi futex we will take prior to returning to user-space
2365  *
2366  * The caller will wait on uaddr and will be requeued by futex_requeue() to
2367  * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
2368  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2369  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
2370  * without one, the pi logic would not know which task to boost/deboost, if
2371  * there was a need to.
2372  *
2373  * We call schedule in futex_wait_queue_me() when we enqueue and return there
2374  * via the following:
2375  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2376  * 2) wakeup on uaddr2 after a requeue
2377  * 3) signal
2378  * 4) timeout
2379  *
2380  * If 3, cleanup and return -ERESTARTNOINTR.
2381  *
2382  * If 2, we may then block on trying to take the rt_mutex and return via:
2383  * 5) successful lock
2384  * 6) signal
2385  * 7) timeout
2386  * 8) other lock acquisition failure
2387  *
2388  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2389  *
2390  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2391  *
2392  * Return:
2393  *  0 - On success;
2394  * <0 - On error
2395  */
2396 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2397                                  u32 val, ktime_t *abs_time, u32 bitset,
2398                                  u32 __user *uaddr2)
2399 {
2400         struct hrtimer_sleeper timeout, *to = NULL;
2401         struct rt_mutex_waiter rt_waiter;
2402         struct rt_mutex *pi_mutex = NULL;
2403         struct futex_hash_bucket *hb;
2404         union futex_key key2 = FUTEX_KEY_INIT;
2405         struct futex_q q = futex_q_init;
2406         int res, ret;
2407
2408         if (uaddr == uaddr2)
2409                 return -EINVAL;
2410
2411         if (!bitset)
2412                 return -EINVAL;
2413
2414         if (abs_time) {
2415                 to = &timeout;
2416                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2417                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2418                                       HRTIMER_MODE_ABS);
2419                 hrtimer_init_sleeper(to, current);
2420                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2421                                              current->timer_slack_ns);
2422         }
2423
2424         /*
2425          * The waiter is allocated on our stack, manipulated by the requeue
2426          * code while we sleep on uaddr.
2427          */
2428         debug_rt_mutex_init_waiter(&rt_waiter);
2429         RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2430         RB_CLEAR_NODE(&rt_waiter.tree_entry);
2431         rt_waiter.task = NULL;
2432
2433         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2434         if (unlikely(ret != 0))
2435                 goto out;
2436
2437         q.bitset = bitset;
2438         q.rt_waiter = &rt_waiter;
2439         q.requeue_pi_key = &key2;
2440
2441         /*
2442          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2443          * count.
2444          */
2445         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2446         if (ret)
2447                 goto out_key2;
2448
2449         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2450         futex_wait_queue_me(hb, &q, to);
2451
2452         spin_lock(&hb->lock);
2453         ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2454         spin_unlock(&hb->lock);
2455         if (ret)
2456                 goto out_put_keys;
2457
2458         /*
2459          * In order for us to be here, we know our q.key == key2, and since
2460          * we took the hb->lock above, we also know that futex_requeue() has
2461          * completed and we no longer have to concern ourselves with a wakeup
2462          * race with the atomic proxy lock acquisition by the requeue code. The
2463          * futex_requeue dropped our key1 reference and incremented our key2
2464          * reference count.
2465          */
2466
2467         /* Check if the requeue code acquired the second futex for us. */
2468         if (!q.rt_waiter) {
2469                 /*
2470                  * Got the lock. We might not be the anticipated owner if we
2471                  * did a lock-steal - fix up the PI-state in that case.
2472                  */
2473                 if (q.pi_state && (q.pi_state->owner != current)) {
2474                         spin_lock(q.lock_ptr);
2475                         ret = fixup_pi_state_owner(uaddr2, &q, current);
2476                         spin_unlock(q.lock_ptr);
2477                 }
2478         } else {
2479                 /*
2480                  * We have been woken up by futex_unlock_pi(), a timeout, or a
2481                  * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2482                  * the pi_state.
2483                  */
2484                 WARN_ON(!q.pi_state);
2485                 pi_mutex = &q.pi_state->pi_mutex;
2486                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2487                 debug_rt_mutex_free_waiter(&rt_waiter);
2488
2489                 spin_lock(q.lock_ptr);
2490                 /*
2491                  * Fixup the pi_state owner and possibly acquire the lock if we
2492                  * haven't already.
2493                  */
2494                 res = fixup_owner(uaddr2, &q, !ret);
2495                 /*
2496                  * If fixup_owner() returned an error, propagate that.  If it
2497                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
2498                  */
2499                 if (res)
2500                         ret = (res < 0) ? res : 0;
2501
2502                 /* Unqueue and drop the lock. */
2503                 unqueue_me_pi(&q);
2504         }
2505
2506         /*
2507          * If fixup_pi_state_owner() faulted and was unable to handle the
2508          * fault, unlock the rt_mutex and return the fault to userspace.
2509          */
2510         if (ret == -EFAULT) {
2511                 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2512                         rt_mutex_unlock(pi_mutex);
2513         } else if (ret == -EINTR) {
2514                 /*
2515                  * We've already been requeued, but cannot restart by calling
2516                  * futex_lock_pi() directly. We could restart this syscall, but
2517                  * it would detect that the user space "val" changed and return
2518                  * -EWOULDBLOCK.  Save the overhead of the restart and return
2519                  * -EWOULDBLOCK directly.
2520                  */
2521                 ret = -EWOULDBLOCK;
2522         }
2523
2524 out_put_keys:
2525         put_futex_key(&q.key);
2526 out_key2:
2527         put_futex_key(&key2);
2528
2529 out:
2530         if (to) {
2531                 hrtimer_cancel(&to->timer);
2532                 destroy_hrtimer_on_stack(&to->timer);
2533         }
2534         return ret;
2535 }
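/*
 * Editor's note: illustrative sketch only, not part of this file.  How
 * the two requeue-PI operations might be paired for a condition
 * variable built on a PI mutex: the waiter names the mutex up front so
 * futex_requeue() can hand it the rt_mutex directly.  cond/mutex/seq,
 * the pi_lock()/pi_unlock() helpers from the sketch above, the usual
 * includes, and the minimal error handling are assumptions.
 *
 *	// Waiter: the PI mutex is held on entry and owned again on a
 *	// successful return; EWOULDBLOCK/ETIMEDOUT handling is omitted.
 *	void cond_wait_pi(uint32_t *cond, uint32_t *mutex, uint32_t tid)
 *	{
 *		uint32_t seq = __atomic_load_n(cond, __ATOMIC_ACQUIRE);
 *
 *		pi_unlock(mutex, tid);
 *		// Sleep on *cond; a requeue-PI waker moves us onto *mutex,
 *		// so by the time we return the kernel has either made us
 *		// the mutex owner or blocked us on its rt_mutex.  The val3
 *		// slot is ignored here: do_futex() forces the match-any
 *		// bitset for FUTEX_WAIT_REQUEUE_PI.
 *		syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI_PRIVATE,
 *			seq, NULL, mutex, 0);
 *	}
 *
 *	// Broadcaster: wake one waiter, requeue the rest onto the mutex.
 *	void cond_broadcast_pi(uint32_t *cond, uint32_t *mutex)
 *	{
 *		uint32_t seq = __atomic_add_fetch(cond, 1, __ATOMIC_RELEASE);
 *
 *		// INT_MAX (the requeue count) rides in the timeout slot.
 *		syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI_PRIVATE,
 *			1, INT_MAX, mutex, seq);
 *	}
 */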
2536
2537 /*
2538  * Support for robust futexes: the kernel cleans up held futexes at
2539  * thread exit time.
2540  *
2541  * Implementation: user-space maintains a per-thread list of locks it
2542  * is holding. Upon do_exit(), the kernel carefully walks this list,
2543  * and marks all locks that are owned by this thread with the
2544  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2545  * always manipulated with the lock held, so the list is private and
2546  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2547  * field, to allow the kernel to clean up if the thread dies after
2548  * acquiring the lock, but just before it could have added itself to
2549  * the list. There can only be one such pending lock.
2550  */
2551
2552 /**
2553  * sys_set_robust_list() - Set the robust-futex list head of a task
2554  * @head:       pointer to the list-head
2555  * @len:        length of the list-head, as userspace expects
2556  */
2557 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2558                 size_t, len)
2559 {
2560         if (!futex_cmpxchg_enabled)
2561                 return -ENOSYS;
2562         /*
2563          * The kernel knows only one size for now:
2564          */
2565         if (unlikely(len != sizeof(*head)))
2566                 return -EINVAL;
2567
2568         current->robust_list = head;
2569
2570         return 0;
2571 }
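/*
 * Editor's note: illustrative sketch only, not part of this file.  The
 * userspace side of the registration above: each held lock embeds a
 * struct robust_list node, and futex_offset tells exit_robust_list()
 * where the lock word lives relative to that node.  The structure and
 * helper names are assumptions; linking/unlinking of individual locks
 * and list_op_pending updates are omitted.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	struct robust_mutex {
 *		struct robust_list list;	// linked into the thread's list
 *		uint32_t futex;			// owner TID, 0 when free
 *	};
 *
 *	static __thread struct robust_list_head robust_head;
 *
 *	void register_robust_list(void)
 *	{
 *		// Empty list: the head points back at itself.
 *		robust_head.list.next = &robust_head.list;
 *		robust_head.futex_offset =
 *			offsetof(struct robust_mutex, futex);
 *		robust_head.list_op_pending = NULL;
 *		syscall(SYS_set_robust_list, &robust_head,
 *			sizeof(robust_head));
 *	}
 */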
2572
2573 /**
2574  * sys_get_robust_list() - Get the robust-futex list head of a task
2575  * @pid:        pid of the process [zero for current task]
2576  * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2577  * @len_ptr:    pointer to a length field, the kernel fills in the header size
2578  */
2579 SYSCALL_DEFINE3(get_robust_list, int, pid,
2580                 struct robust_list_head __user * __user *, head_ptr,
2581                 size_t __user *, len_ptr)
2582 {
2583         struct robust_list_head __user *head;
2584         unsigned long ret;
2585         struct task_struct *p;
2586
2587         if (!futex_cmpxchg_enabled)
2588                 return -ENOSYS;
2589
2590         rcu_read_lock();
2591
2592         ret = -ESRCH;
2593         if (!pid)
2594                 p = current;
2595         else {
2596                 p = find_task_by_vpid(pid);
2597                 if (!p)
2598                         goto err_unlock;
2599         }
2600
2601         ret = -EPERM;
2602         if (!ptrace_may_access(p, PTRACE_MODE_READ))
2603                 goto err_unlock;
2604
2605         head = p->robust_list;
2606         rcu_read_unlock();
2607
2608         if (put_user(sizeof(*head), len_ptr))
2609                 return -EFAULT;
2610         return put_user(head, head_ptr);
2611
2612 err_unlock:
2613         rcu_read_unlock();
2614
2615         return ret;
2616 }
2617
2618 /*
2619  * Process a futex-list entry, check whether it's owned by the
2620  * dying task, and do notification if so:
2621  */
2622 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2623 {
2624         u32 uval, uninitialized_var(nval), mval;
2625
2626 retry:
2627         if (get_user(uval, uaddr))
2628                 return -1;
2629
2630         if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2631                 /*
2632                  * Ok, this dying thread is truly holding a futex
2633                  * of interest. Set the OWNER_DIED bit atomically
2634                  * via cmpxchg, and if the value had FUTEX_WAITERS
2635                  * set, wake up a waiter (if any). (We have to do a
2636                  * futex_wake() even if OWNER_DIED is already set -
2637                  * to handle the rare but possible case of recursive
2638                  * thread-death.) The rest of the cleanup is done in
2639                  * userspace.
2640                  */
2641                 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2642                 /*
2643                  * We are not holding a lock here, but we want to have
2644                  * the pagefault_disable/enable() protection because
2645                  * we want to handle the fault gracefully. If the
2646                  * access fails we try to fault in the futex with R/W
2647                  * verification via get_user_pages. get_user() above
2648                  * does not guarantee R/W access. If that fails we
2649                  * give up and leave the futex locked.
2650                  */
2651                 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2652                         if (fault_in_user_writeable(uaddr))
2653                                 return -1;
2654                         goto retry;
2655                 }
2656                 if (nval != uval)
2657                         goto retry;
2658
2659                 /*
2660                  * Wake robust non-PI futexes here. The wakeup of
2661                  * PI futexes happens in exit_pi_state():
2662                  */
2663                 if (!pi && (uval & FUTEX_WAITERS))
2664                         futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2665         }
2666         return 0;
2667 }
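/*
 * Editor's note: illustrative sketch only, not part of this file.
 * What the next locker might do with the FUTEX_OWNER_DIED bit that
 * handle_futex_death() sets above, in the spirit of
 * pthread_mutex_lock() returning EOWNERDEAD.  try_lock_robust() is an
 * assumed helper; re-linking the lock into the new owner's robust list
 * is omitted.
 *
 *	// 1: locked normally; 2: locked but the old owner died, so the
 *	// protected data needs recovery; 0: a live owner still holds it.
 *	int try_lock_robust(uint32_t *futex, uint32_t tid)
 *	{
 *		uint32_t old = __atomic_load_n(futex, __ATOMIC_RELAXED);
 *
 *		if (old & FUTEX_TID_MASK)
 *			return 0;
 *		if (!__atomic_compare_exchange_n(futex, &old,
 *						 tid | (old & FUTEX_WAITERS),
 *						 0, __ATOMIC_ACQUIRE,
 *						 __ATOMIC_RELAXED))
 *			return 0;
 *		return (old & FUTEX_OWNER_DIED) ? 2 : 1;
 *	}
 */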
2668
2669 /*
2670  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2671  */
2672 static inline int fetch_robust_entry(struct robust_list __user **entry,
2673                                      struct robust_list __user * __user *head,
2674                                      unsigned int *pi)
2675 {
2676         unsigned long uentry;
2677
2678         if (get_user(uentry, (unsigned long __user *)head))
2679                 return -EFAULT;
2680
2681         *entry = (void __user *)(uentry & ~1UL);
2682         *pi = uentry & 1;
2683
2684         return 0;
2685 }
2686
2687 /*
2688  * Walk curr->robust_list (very carefully, it's a userspace list!)
2689  * and mark any locks found there dead, and notify any waiters.
2690  *
2691  * We silently return on any sign of list-walking problem.
2692  */
2693 void exit_robust_list(struct task_struct *curr)
2694 {
2695         struct robust_list_head __user *head = curr->robust_list;
2696         struct robust_list __user *entry, *next_entry, *pending;
2697         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2698         unsigned int uninitialized_var(next_pi);
2699         unsigned long futex_offset;
2700         int rc;
2701
2702         if (!futex_cmpxchg_enabled)
2703                 return;
2704
2705         /*
2706          * Fetch the list head (which was registered earlier, via
2707          * sys_set_robust_list()):
2708          */
2709         if (fetch_robust_entry(&entry, &head->list.next, &pi))
2710                 return;
2711         /*
2712          * Fetch the relative futex offset:
2713          */
2714         if (get_user(futex_offset, &head->futex_offset))
2715                 return;
2716         /*
2717          * Fetch any possibly pending lock-add first, and handle it
2718          * if it exists:
2719          */
2720         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2721                 return;
2722
2723         next_entry = NULL;      /* avoid warning with gcc */
2724         while (entry != &head->list) {
2725                 /*
2726                  * Fetch the next entry in the list before calling
2727                  * handle_futex_death:
2728                  */
2729                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2730                 /*
2731                  * A pending lock might already be on the list, so
2732                  * don't process it twice:
2733                  */
2734                 if (entry != pending)
2735                         if (handle_futex_death((void __user *)entry + futex_offset,
2736                                                 curr, pi))
2737                                 return;
2738                 if (rc)
2739                         return;
2740                 entry = next_entry;
2741                 pi = next_pi;
2742                 /*
2743                  * Avoid excessively long or circular lists:
2744                  */
2745                 if (!--limit)
2746                         break;
2747
2748                 cond_resched();
2749         }
2750
2751         if (pending)
2752                 handle_futex_death((void __user *)pending + futex_offset,
2753                                    curr, pip);
2754 }
2755
2756 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2757                 u32 __user *uaddr2, u32 val2, u32 val3)
2758 {
2759         int cmd = op & FUTEX_CMD_MASK;
2760         unsigned int flags = 0;
2761
2762         if (!(op & FUTEX_PRIVATE_FLAG))
2763                 flags |= FLAGS_SHARED;
2764
2765         if (op & FUTEX_CLOCK_REALTIME) {
2766                 flags |= FLAGS_CLOCKRT;
2767                 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2768                         return -ENOSYS;
2769         }
2770
2771         switch (cmd) {
2772         case FUTEX_LOCK_PI:
2773         case FUTEX_UNLOCK_PI:
2774         case FUTEX_TRYLOCK_PI:
2775         case FUTEX_WAIT_REQUEUE_PI:
2776         case FUTEX_CMP_REQUEUE_PI:
2777                 if (!futex_cmpxchg_enabled)
2778                         return -ENOSYS;
2779         }
2780
2781         switch (cmd) {
2782         case FUTEX_WAIT:
2783                 val3 = FUTEX_BITSET_MATCH_ANY;
2784         case FUTEX_WAIT_BITSET:
2785                 return futex_wait(uaddr, flags, val, timeout, val3);
2786         case FUTEX_WAKE:
2787                 val3 = FUTEX_BITSET_MATCH_ANY;
2788         case FUTEX_WAKE_BITSET:
2789                 return futex_wake(uaddr, flags, val, val3);
2790         case FUTEX_REQUEUE:
2791                 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2792         case FUTEX_CMP_REQUEUE:
2793                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2794         case FUTEX_WAKE_OP:
2795                 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2796         case FUTEX_LOCK_PI:
2797                 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2798         case FUTEX_UNLOCK_PI:
2799                 return futex_unlock_pi(uaddr, flags);
2800         case FUTEX_TRYLOCK_PI:
2801                 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2802         case FUTEX_WAIT_REQUEUE_PI:
2803                 val3 = FUTEX_BITSET_MATCH_ANY;
2804                 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2805                                              uaddr2);
2806         case FUTEX_CMP_REQUEUE_PI:
2807                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2808         }
2809         return -ENOSYS;
2810 }
2811
2812
2813 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2814                 struct timespec __user *, utime, u32 __user *, uaddr2,
2815                 u32, val3)
2816 {
2817         struct timespec ts;
2818         ktime_t t, *tp = NULL;
2819         u32 val2 = 0;
2820         int cmd = op & FUTEX_CMD_MASK;
2821
2822         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2823                       cmd == FUTEX_WAIT_BITSET ||
2824                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
2825                 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2826                         return -EFAULT;
2827                 if (!timespec_valid(&ts))
2828                         return -EINVAL;
2829
2830                 t = timespec_to_ktime(ts);
2831                 if (cmd == FUTEX_WAIT)
2832                         t = ktime_add_safe(ktime_get(), t);
2833                 tp = &t;
2834         }
2835         /*
2836          * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2837          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2838          */
2839         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2840             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2841                 val2 = (u32) (unsigned long) utime;
2842
2843         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2844 }
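/*
 * Editor's note: illustrative sketch only, not part of this file.  The
 * utime argument is multiplexed exactly as the wrapper above shows: a
 * relative timespec for FUTEX_WAIT (converted to an absolute expiry
 * via ktime_add_safe()), an absolute one for the bitset and PI waits,
 * and a bare count smuggled through the pointer for the requeue and
 * WAKE_OP commands.  wait_timed() is an assumed helper.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *	#include <time.h>
 *
 *	// 0 on wakeup; -1 with errno ETIMEDOUT, EAGAIN or EINTR otherwise.
 *	int wait_timed(uint32_t *uaddr, uint32_t expected, unsigned int ms)
 *	{
 *		struct timespec rel = {
 *			.tv_sec  = ms / 1000,
 *			.tv_nsec = (long)(ms % 1000) * 1000000,
 *		};
 *
 *		return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE,
 *			       expected, &rel, NULL, 0);
 *	}
 */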
2845
2846 static int __init futex_init(void)
2847 {
2848         u32 curval;
2849         unsigned int futex_shift;
2850         unsigned long i;
2851
2852 #if CONFIG_BASE_SMALL
2853         futex_hashsize = 16;
2854 #else
2855         futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
2856 #endif
2857
2858         futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
2859                                                futex_hashsize, 0,
2860                                                futex_hashsize < 256 ? HASH_SMALL : 0,
2861                                                &futex_shift, NULL,
2862                                                futex_hashsize, futex_hashsize);
2863         futex_hashsize = 1UL << futex_shift;
2864         /*
2865          * This will fail and we want it. Some arch implementations do
2866          * runtime detection of the futex_atomic_cmpxchg_inatomic()
2867          * functionality. We want to know that before we call in any
2868          * of the complex code paths. Also we want to prevent
2869          * registration of robust lists in that case. NULL is
2870          * guaranteed to fault and we get -EFAULT on functional
2871          * implementation, the non-functional ones will return
2872          * -ENOSYS.
2873          */
2874         if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2875                 futex_cmpxchg_enabled = 1;
2876
2877         for (i = 0; i < futex_hashsize; i++) {
2878                 plist_head_init(&futex_queues[i].chain);
2879                 spin_lock_init(&futex_queues[i].lock);
2880         }
2881
2882         return 0;
2883 }
2884 __initcall(futex_init);