futex requeueing feature, futex-requeue-2.5.69-D3

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 11:40:08



the attached patch addresses a futex related SMP scalability problem of
glibc. A number of regressions have been reported to the NPTL mailing list
when going to many CPUs, for applications that use condition variables and
the pthread_cond_broadcast() API call. Using this functionality, testcode
shows a slowdown from 0.12 seconds runtime to over 237 seconds (!)  
runtime, on 4-CPU systems.

pthread condition variables use two futex-backed mutex-alike locks: an
internal one for the glibc CV state itself, and a user-supplied mutex
which the API guarantees to take in certain codepaths. (Unfortunately the
user-supplied mutex cannot be used to protect the CV state, so we've got
to deal with two locks.)

The cause of the slowdown is a 'swarm effect': if lots of threads are
blocked on a condition variable, and pthread_cond_broadcast() is done,
then glibc first does a FUTEX_WAKE on the cv-internal mutex, then does a
mutex_down() on the user-supplied mutex. I.e. a swarm of threads is created
which all race to serialize on the user-supplied mutex. The more threads
are used, the more likely it becomes that the scheduler will balance them
over to other CPUs - where they just schedule, try to lock the mutex, and
go to sleep. This 'swarm effect' is purely technical, a side-effect of
glibc's use of futexes, and the imperfect coupling of the two locks.

the solution to this problem is to not wake up the swarm of threads, but
'requeue' them from the CV-internal mutex to the user-supplied mutex. The
attached patch adds the FUTEX_REQUEUE feature. FUTEX_REQUEUE requeues N
threads from futex address A to futex address B:

        sys_futex(uaddr, FUTEX_REQUEUE, nr_wake, NULL, uaddr2);

the 'val' parameter to sys_futex (nr_wake) is the # of woken up threads.  
This way glibc can wake up a single thread (which will take the
user-mutex), and can requeue the rest, with a single system-call.

Ulrich Drepper has implemented FUTEX_REQUEUE support in glibc, and a
number of people have tested it over the past couple of weeks. Here are
the measurements done by Saurabh Desai:

System: 4xPIII 700MHz

 ./cond-perf -r 100 -n 200:        1p       2p         4p
 Default NPTL:                 0.120s   0.211s   237.407s
 requeue NPTL:                 0.124s   0.156s     0.040s

 ./cond-perf -r 1000 -n 100:
 Default NPTL:                 0.276s   0.412s     0.530s
 requeue NPTL:                 0.349s   0.503s     0.550s

 ./pp -v -n 128 -i 1000 -S 32768:
 Default NPTL: 128 games in    1.111s   1.270s    16.894s
 requeue NPTL: 128 games in    1.111s   1.959s     2.426s

 ./pp -v -n 1024 -i 10 -S 32768:
 Default NPTL: 1024 games in   0.181s   0.394s     incompleted 2m+
 requeue NPTL: 1024 games in   0.166s   0.254s     0.341s

the speedup with increasing number of threads is quite significant, in the
128 threads case, it's more than 8 times. In the cond-perf test, on 4 CPUs
it's almost infinitely faster than the 'swarm of threads' catastrophe
triggered by the old code.

there's a slowdown on UP, which is expected: on UP the O(1) scheduler
implicitly serializes all active threads on the runqueue, and doesn't
degrade under lots of threads. On SMP the 'point of breakdown' depends on
the precise amount of time needed for the threads to become rated as
'cache-cold' by the load-balancer.

(the patch adds a new futex syscall parameter (uaddr2), which is a
compatible extension of sys_futex. Old NPTL applications will continue to
work without any impact, only the FUTEX_REQUEUE codepath uses the new
parameter.)

        Ingo

--- linux/include/linux/futex.h.orig    
+++ linux/include/linux/futex.h
@@ -5,7 +5,8 @@
 #define FUTEX_WAIT (0)
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
+#define FUTEX_REQUEUE (3)

-extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime);
+extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2);

 #endif
--- linux/kernel/fork.c.orig    
+++ linux/kernel/fork.c
@@ -457,7 +457,7 @@ void mm_release(struct task_struct *tsk,
                 * not set up a proper pointer then tough luck.
                 */
                put_user(0, tidptr);
-               sys_futex(tidptr, FUTEX_WAKE, 1, NULL);
+               sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
        }
 }

--- linux/kernel/compat.c.orig  
+++ linux/kernel/compat.c      
@@ -214,7 +214,7 @@ asmlinkage long compat_sys_sigprocmask(i
 extern long do_futex(unsigned long, int, int, unsigned long);

 asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
-               struct compat_timespec *utime)
+               struct compat_timespec *utime, u32 *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -224,7 +224,7 @@ asmlinkage long compat_sys_futex(u32 *ua
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout);
+       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
 }

 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit *rlim);
--- linux/kernel/futex.c.orig  
+++ linux/kernel/futex.c        
@@ -2,6 +2,9 @@
  *  Fast Userspace Mutexes (which I call "Futexes!").
  *  (C) Rusty Russell, IBM 2002
  *
+ *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -9,9 +12,6 @@
  *  "The futexes are also cursed."
  *  "But they come in a choice of three flavours!"
  *
- *  Generalized futexes for every mapping type, Ingo Molnar, 2002
- *
- *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
@@ -216,6 +216,61 @@ static void futex_vcache_callback(vcache
        spin_unlock(&futex_lock);
 }

+/*
+ * Requeue all waiters hashed on one physical page to another
+ * physical page.
+ */
+static int futex_requeue(unsigned long uaddr1, int offset1, unsigned long uaddr2, int offset2, int num)
+{
+       struct list_head *i, *next, *head1, *head2;
+       struct page *page1, *page2;
+       int ret = 0;
+
+       lock_futex_mm();
+
+       page1 = __pin_page(uaddr1 - offset1);
+       if (!page1) {
+               unlock_futex_mm();
+               return -EFAULT;
+       }
+       page2 = __pin_page(uaddr2 - offset2);
+       if (!page2) {
+               unlock_futex_mm();
+               return -EFAULT;
+       }
+
+       head1 = hash_futex(page1, offset1);
+       head2 = hash_futex(page2, offset2);
+
+       list_for_each_safe(i, next, head1) {
+               struct futex_q *this = list_entry(i, struct futex_q, list);
+
+               if (this->page == page1 && this->offset == offset1) {
+                       list_del_init(i);
+                       __detach_vcache(&this->vcache);
+                       if (++ret <= num) {
+                               wake_up_all(&this->waiters);
+                               if (this->filp)
+                                       send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
+                       } else {
+                               unpin_page(this->page);
+                               __pin_page_atomic (page2);
+                               list_add_tail(i, head2);
+                               __attach_vcache(&this->vcache, uaddr2, current->mm, futex_vcache_callback);
+                               this->offset = offset2;
+                               this->page = page2;
+                       }
+               }
+       }
+
+       unlock_futex_mm();
+
+       unpin_page(page1);
+       unpin_page(page2);
+
+       return ret;
+}
+
 static inline void __queue_me(struct futex_q *q, struct page *page,
                                unsigned long uaddr, int offset,
                                int fd, struct file *filp)
@@ -425,9 +480,9 @@ out:
        return ret;
 }

-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout)
+long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
 {
-       unsigned long pos_in_page;
+       unsigned long pos_in_page, pos_in_page2;
        int ret;

        pos_in_page = uaddr % PAGE_SIZE;
@@ -443,6 +498,14 @@ long do_futex(unsigned long uaddr, int o
        case FUTEX_WAKE:
                ret = futex_wake(uaddr, pos_in_page, val);
                break;
+       case FUTEX_REQUEUE:
+               pos_in_page2 = uaddr2 % PAGE_SIZE;
+
+               /* Must be "naturally" aligned */
+               if (pos_in_page2 % sizeof(u32))
+                       return -EINVAL;
+               ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2, val);
+               break;
        case FUTEX_FD:
                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
                ret = futex_fd(uaddr, pos_in_page, val);
@@ -453,7 +516,7 @@ long do_futex(unsigned long uaddr, int o
        return ret;
 }

-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime)
+asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -463,7 +526,7 @@ asmlinkage long sys_futex(u32 __user *ua
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout);
+       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
 }

 static struct super_block *

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Christoph Hellwig » Tue, 20 May 2003 12:20:05



> the solution to this problem is to not wake up the swarm of threads, but
> 'requeue' them from the CV-internal mutex to the user-supplied mutex. The
> attached patch adds the FUTEX_REQUEUE feature FUTEX_REQUEUE requeues N
> threads from futex address A to futex address B:

>    sys_futex(uaddr, FUTEX_REQUEUE, nr_wake, NULL, uaddr2);

> the 'val' parameter to sys_futex (nr_wake) is the # of woken up threads.  
> This way glibc can wake up a single thread (which will take the
> user-mutex), and can requeue the rest, with a single system-call.

Urgg, yet another sys_futex extension.  Could you please split all
these totally different cases into separate syscalls instead?

Quote:> +                          wake_up_all(&this->waiters);
> +                          if (this->filp)
> +                                  send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
> +                  } else {
> +                          unpin_page(this->page);
> +                          __pin_page_atomic (page2);
> +                          list_add_tail(i, head2);
> +                          __attach_vcache(&this->vcache, uaddr2, current->mm, futex_vcache_callback);

Please linewrap after 80 columns, thanks.

Quote:> +  case FUTEX_REQUEUE:
> +          pos_in_page2 = uaddr2 % PAGE_SIZE;
> +
> +          /* Must be "naturally" aligned */
> +          if (pos_in_page2 % sizeof(u32))
> +                  return -EINVAL;

Who guarantees that the alignment of u32 is always the same as its size?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 12:30:06



> [...] Could you please split all these totally different cases into
> separate syscalls instead?

sure, i'm all for it, but in a different pass, and after syncing up with
glibc. An API cleanup like this should have been done upon the
introduction of futexes, why didn't you comment on this then? Splitting off
FUTEX_REQUEUE in isolation is quite pointless.

Quote:> > +     case FUTEX_REQUEUE:
> > +             pos_in_page2 = uaddr2 % PAGE_SIZE;
> > +
> > +             /* Must be "naturally" aligned */
> > +             if (pos_in_page2 % sizeof(u32))
> > +                     return -EINVAL;

> Who guarantees that the alignment of u32 is always the same as its size?

glibc. We do not want to handle all the misaligned cases for obvious
reasons. The use of u32 (instead of a native word) is a bit unfortunate on
64-bit systems but now a reality.

        Ingo

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Andrew Morton » Tue, 20 May 2003 12:30:11



>  + page1 = __pin_page(uaddr1 - offset1);
>  + if (!page1) {
>  +         unlock_futex_mm();
>  +         return -EFAULT;
>  + }
>  + page2 = __pin_page(uaddr2 - offset2);
>  + if (!page2) {
>  +         unlock_futex_mm();
>  +         return -EFAULT;
>  + }

page1 is leaked.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 12:40:10


On Mon, 19 May 2003, Andrew Morton wrote:
> page1 is leaked.

doh, indeed. -D4 patch attached. It also fixes the line-wrap noticed by
Christoph Hellwig. Patch applies, compiles and boots fine.

        Ingo

--- linux/include/linux/futex.h.orig    
+++ linux/include/linux/futex.h
@@ -5,7 +5,8 @@
 #define FUTEX_WAIT (0)
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
+#define FUTEX_REQUEUE (3)

-extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime);
+extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2);

 #endif
--- linux/kernel/fork.c.orig    
+++ linux/kernel/fork.c
@@ -457,7 +457,7 @@ void mm_release(struct task_struct *tsk,
                 * not set up a proper pointer then tough luck.
                 */
                put_user(0, tidptr);
-               sys_futex(tidptr, FUTEX_WAKE, 1, NULL);
+               sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
        }
 }

--- linux/kernel/compat.c.orig  
+++ linux/kernel/compat.c      
@@ -214,7 +214,7 @@ asmlinkage long compat_sys_sigprocmask(i
 extern long do_futex(unsigned long, int, int, unsigned long);

 asmlinkage long compat_sys_futex(u32 *uaddr, int op, int val,
-               struct compat_timespec *utime)
+               struct compat_timespec *utime, u32 *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -224,7 +224,7 @@ asmlinkage long compat_sys_futex(u32 *ua
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout);
+       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
 }

 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit *rlim);
--- linux/kernel/futex.c.orig  
+++ linux/kernel/futex.c        
@@ -2,6 +2,9 @@
  *  Fast Userspace Mutexes (which I call "Futexes!").
  *  (C) Rusty Russell, IBM 2002
  *
+ *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -9,9 +12,6 @@
  *  "The futexes are also cursed."
  *  "But they come in a choice of three flavours!"
  *
- *  Generalized futexes for every mapping type, Ingo Molnar, 2002
- *
- *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
@@ -216,6 +216,62 @@ static void futex_vcache_callback(vcache
        spin_unlock(&futex_lock);
 }

+/*
+ * Requeue all waiters hashed on one physical page to another
+ * physical page.
+ */
+static int futex_requeue(unsigned long uaddr1, int offset1, unsigned long uaddr2, int offset2, int num)
+{
+       struct list_head *i, *next, *head1, *head2;
+       struct page *page1 = NULL, *page2 = NULL;
+       int ret = 0;
+
+       lock_futex_mm();
+
+       page1 = __pin_page(uaddr1 - offset1);
+       if (!page1)
+               goto out;
+       page2 = __pin_page(uaddr2 - offset2);
+       if (!page2)
+               goto out;
+
+       head1 = hash_futex(page1, offset1);
+       head2 = hash_futex(page2, offset2);
+
+       list_for_each_safe(i, next, head1) {
+               struct futex_q *this = list_entry(i, struct futex_q, list);
+
+               if (this->page == page1 && this->offset == offset1) {
+                       list_del_init(i);
+                       __detach_vcache(&this->vcache);
+                       if (++ret <= num) {
+                               wake_up_all(&this->waiters);
+                               if (this->filp)
+                                       send_sigio(&this->filp->f_owner,
+                                                       this->fd, POLL_IN);
+                       } else {
+                               unpin_page(this->page);
+                               __pin_page_atomic (page2);
+                               list_add_tail(i, head2);
+                               __attach_vcache(&this->vcache, uaddr2,
+                                       current->mm, futex_vcache_callback);
+                               this->offset = offset2;
+                               this->page = page2;
+                       }
+               }
+       }
+
+out:
+       unlock_futex_mm();
+
+       if (page1)
+               unpin_page(page1);
+       if (page2)
+               unpin_page(page2);
+
+       return ret;
+}
+
 static inline void __queue_me(struct futex_q *q, struct page *page,
                                unsigned long uaddr, int offset,
                                int fd, struct file *filp)
@@ -425,9 +481,9 @@ out:
        return ret;
 }

-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout)
+long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
 {
-       unsigned long pos_in_page;
+       unsigned long pos_in_page, pos_in_page2;
        int ret;

        pos_in_page = uaddr % PAGE_SIZE;
@@ -443,6 +499,14 @@ long do_futex(unsigned long uaddr, int o
        case FUTEX_WAKE:
                ret = futex_wake(uaddr, pos_in_page, val);
                break;
+       case FUTEX_REQUEUE:
+               pos_in_page2 = uaddr2 % PAGE_SIZE;
+
+               /* Must be "naturally" aligned */
+               if (pos_in_page2 % sizeof(u32))
+                       return -EINVAL;
+               ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2, val);
+               break;
        case FUTEX_FD:
                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
                ret = futex_fd(uaddr, pos_in_page, val);
@@ -453,7 +517,7 @@ long do_futex(unsigned long uaddr, int o
        return ret;
 }

-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime)
+asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -463,7 +527,7 @@ asmlinkage long sys_futex(u32 __user *ua
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout);
+       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
 }

 static struct super_block *

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Christoph Hellwig » Tue, 20 May 2003 14:00:09



> sure, i'm all for it, but in a different pass, and after syncing up with
> glibc. An API cleanup like this should have been done upon the
> introduction of futexes, why didn't you comment on this then? Splitting off
> FUTEX_REQUEUE in isolation is quite pointless.

Maybe I don't spend all my time watching the futex API? :)  Okay,
let's make a deal, you add a new syscall for this case and I'll fix
up the older ones in a patch that's on top of yours?

Quote:> > > +        case FUTEX_REQUEUE:
> > > +                pos_in_page2 = uaddr2 % PAGE_SIZE;
> > > +
> > > +                /* Must be "naturally" aligned */
> > > +                if (pos_in_page2 % sizeof(u32))
> > > +                        return -EINVAL;

> > Who guarantees that the alignment of u32 is always the same as its size?

> glibc. We do not want to handle all the misaligned cases for obvious
> reasons. The use of u32 (instead of a native word) is a bit unfortunate on
> 64-bit systems but now a reality.

Sorry if the question wasn't clear, but who guarantees that the alignment
of u32 is the same as its size?  You test for the size of u32, not its
alignment, even if they usually are the same.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 14:40:17


On Mon, 19 May 2003, Christoph Hellwig wrote:
> Maybe I don't spend all my time watching the futex API? :)  Okay, let's
> make a deal, you add a new syscall for this case and I'll fix up the
> older ones in a patch that's on top of yours?

well, my point was that it's not an issue introduced by the FUTEX_REQUEUE
change. Here's a patch on top of my last futex patch, which adds all the
interface cleanups. Changes:

 - separate out sys_futex_wait, sys_futex_wake and sys_futex_requeue.

 - inline the futex_wait/wake/requeue functionality, as it should be.

 - start the phasing out of FUTEX_FD. This i believe is quite unclean and
   unrobust, because it attaches a new concept (futexes) to a very old
   (polling) concept. We want futex support in kernel-AIO, not in the
   polling APIs. AFAIK only NGPT uses FUTEX_FD.

> > > Who guarantees that the alignment of u32 is always the same as its size?

> > glibc. We do not want to handle all the misaligned cases for obvious
> > reasons. The use of u32 (instead of a native word) is a bit unfortunate on
> > 64-bit systems but now a reality.

> Sorry if the question wasn't clear, but who guarantees that the
> alignment of u32 is the same as its size?  You test for the size of u32,
> not its alignment, even if they usually are the same.

glibc (the main user and API-multiplexer of futexes) ensures that futex
variables are aligned on sizeof(u32).

        Ingo

--- linux/include/linux/futex.h.orig    
+++ linux/include/linux/futex.h
@@ -2,11 +2,13 @@
 #define _LINUX_FUTEX_H

 /* Second argument to futex syscall */
-#define FUTEX_WAIT (0)
-#define FUTEX_WAKE (1)
-#define FUTEX_FD (2)
-#define FUTEX_REQUEUE (3)

-extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2);
+asmlinkage long sys_futex_wait(u32 __user *__uaddr,int val,
+                               struct timespec __user *utime);
+
+asmlinkage long sys_futex_wake(u32 __user *__uaddr, int val);
+
+asmlinkage long sys_futex_requeue(u32 __user *__uaddr1, u32 __user *__uaddr2,
+                                       int nr_wake);

 #endif
--- linux/arch/i386/kernel/entry.S.orig
+++ linux/arch/i386/kernel/entry.S      
@@ -837,7 +837,7 @@ ENTRY(sys_call_table)
        .long sys_fremovexattr
        .long sys_tkill
        .long sys_sendfile64
-       .long sys_futex         /* 240 */
+       .long old_futex         /* 240 */
        .long sys_sched_setaffinity
        .long sys_sched_getaffinity
        .long sys_set_thread_area
@@ -865,6 +865,8 @@ ENTRY(sys_call_table)
        .long sys_clock_gettime         /* 265 */
        .long sys_clock_getres
        .long sys_clock_nanosleep
-
+       .long sys_futex_wait
+       .long sys_futex_wake
+       .long sys_futex_requeue         /* 270 */

 nr_syscalls=(.-sys_call_table)/4
--- linux/kernel/futex.c.orig  
+++ linux/kernel/futex.c        
@@ -347,7 +347,7 @@ static int futex_wait(unsigned long uadd
         * The get_user() above might fault and schedule so we
         * cannot just set TASK_INTERRUPTIBLE state when queueing
         * ourselves into the futex hash. This code thus has to
-        * rely on the FUTEX_WAKE code doing a wakeup after removing
+        * rely on the futex_wake() code doing a wakeup after removing
         * the waiter from the list.
         */
        add_wait_queue(&q.waiters, &wait);
@@ -481,9 +481,13 @@ out:
        return ret;
 }

-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
+#define OLD_FUTEX_WAIT (0)
+#define OLD_FUTEX_WAKE (1)
+#define OLD_FUTEX_FD (2)
+
+static long do_old_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
 {
-       unsigned long pos_in_page, pos_in_page2;
+       unsigned long pos_in_page;
        int ret;

        pos_in_page = uaddr % PAGE_SIZE;
@@ -493,21 +497,13 @@ long do_futex(unsigned long uaddr, int o
                return -EINVAL;

        switch (op) {
-       case FUTEX_WAIT:
+       case OLD_FUTEX_WAIT:
                ret = futex_wait(uaddr, pos_in_page, val, timeout);
                break;
-       case FUTEX_WAKE:
+       case OLD_FUTEX_WAKE:
                ret = futex_wake(uaddr, pos_in_page, val);
                break;
-       case FUTEX_REQUEUE:
-               pos_in_page2 = uaddr2 % PAGE_SIZE;
-
-               /* Must be "naturally" aligned */
-               if (pos_in_page2 % sizeof(u32))
-                       return -EINVAL;
-               ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2, val);
-               break;
-       case FUTEX_FD:
+       case OLD_FUTEX_FD:
                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
                ret = futex_fd(uaddr, pos_in_page, val);
                break;
@@ -517,17 +513,63 @@ long do_futex(unsigned long uaddr, int o
        return ret;
 }

-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
+asmlinkage long old_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;

-       if ((op == FUTEX_WAIT) && utime) {
+       if ((op == OLD_FUTEX_WAIT) && utime) {
+               if (copy_from_user(&t, utime, sizeof(t)) != 0)
+                       return -EFAULT;
+               timeout = timespec_to_jiffies(&t) + 1;
+       }
+       return do_old_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
+}
+
+asmlinkage long sys_futex_wait(u32 __user *__uaddr, int val, struct timespec __user *utime)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+       unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+       struct timespec t;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       if (utime) {
                if (copy_from_user(&t, utime, sizeof(t)) != 0)
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
+       return futex_wait(uaddr, pos_in_page, val, timeout);
+}
+
+asmlinkage long sys_futex_wake(u32 __user *__uaddr, int val)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       return futex_wake(uaddr, pos_in_page, val);
+}
+
+asmlinkage long sys_futex_requeue(u32 __user *__uaddr1,
+                               u32 __user *__uaddr2, int nr_wake)
+{
+       unsigned long uaddr1 = (unsigned long)__uaddr1,
+                       uaddr2 = (unsigned long)__uaddr2;
+       unsigned long pos_in_page1 = uaddr1 % PAGE_SIZE,
+                       pos_in_page2 = uaddr2 % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if ((pos_in_page1 | pos_in_page2) % sizeof(u32))
+               return -EINVAL;
+
+       return futex_requeue(uaddr1, pos_in_page1, uaddr2, pos_in_page2, nr_wake);
 }

 static struct super_block *
--- linux/kernel/fork.c.orig    
+++ linux/kernel/fork.c
@@ -457,7 +457,7 @@ void mm_release(struct task_struct *tsk,
                 * not set up a proper pointer then tough luck.
                 */
                put_user(0, tidptr);
-               sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
+               sys_futex_wake(tidptr, 1);
        }
 }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 15:00:19


updated patch (the inlining changes were missing):

--- linux/include/linux/futex.h.orig    
+++ linux/include/linux/futex.h
@@ -2,11 +2,13 @@
 #define _LINUX_FUTEX_H

 /* Second argument to futex syscall */
-#define FUTEX_WAIT (0)
-#define FUTEX_WAKE (1)
-#define FUTEX_FD (2)
-#define FUTEX_REQUEUE (3)

-extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2);
+asmlinkage long sys_futex_wait(u32 __user *__uaddr,int val,
+                               struct timespec __user *utime);
+
+asmlinkage long sys_futex_wake(u32 __user *__uaddr, int val);
+
+asmlinkage long sys_futex_requeue(u32 __user *__uaddr1, u32 __user *__uaddr2,
+                                       int nr_wake);

 #endif
--- linux/arch/i386/kernel/entry.S.orig
+++ linux/arch/i386/kernel/entry.S      
@@ -837,7 +837,7 @@ ENTRY(sys_call_table)
        .long sys_fremovexattr
        .long sys_tkill
        .long sys_sendfile64
-       .long sys_futex         /* 240 */
+       .long old_futex         /* 240 */
        .long sys_sched_setaffinity
        .long sys_sched_getaffinity
        .long sys_set_thread_area
@@ -865,6 +865,8 @@ ENTRY(sys_call_table)
        .long sys_clock_gettime         /* 265 */
        .long sys_clock_getres
        .long sys_clock_nanosleep
-
+       .long sys_futex_wait
+       .long sys_futex_wake
+       .long sys_futex_requeue         /* 270 */

 nr_syscalls=(.-sys_call_table)/4
--- linux/kernel/futex.c.orig  
+++ linux/kernel/futex.c        
@@ -98,13 +98,13 @@ static inline struct list_head *hash_fut
  *
  * Must be called with (and returns with) all futex-MM locks held.
  */
-static inline
-struct page *__pin_page_atomic (struct page *page)
+static inline struct page *__pin_page_atomic (struct page *page)
 {
        if (!PageReserved(page))
                get_page(page);
        return page;
 }
+
 static struct page *__pin_page(unsigned long addr)
 {
        struct mm_struct *mm = current->mm;
@@ -155,7 +155,7 @@ static inline void unpin_page(struct pag
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int offset, int num)
+static inline int futex_wake(unsigned long uaddr, int offset, int num)
 {
        struct list_head *i, *next, *head;
        struct page *page;
@@ -220,7 +220,8 @@ static void futex_vcache_callback(vcache
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(unsigned long uaddr1, int offset1, unsigned long uaddr2, int offset2, int num)
+static inline int futex_requeue(unsigned long uaddr1, int offset1,
+               unsigned long uaddr2, int offset2, int num)
 {
        struct list_head *i, *next, *head1, *head2;
        struct page *page1 = NULL, *page2 = NULL;
@@ -308,7 +309,7 @@ static inline int unqueue_me(struct fute
        return ret;
 }

-static int futex_wait(unsigned long uaddr,
+static inline int futex_wait(unsigned long uaddr,
                      int offset,
                      int val,
                      unsigned long time)
@@ -347,7 +348,7 @@ static int futex_wait(unsigned long uadd
         * The get_user() above might fault and schedule so we
         * cannot just set TASK_INTERRUPTIBLE state when queueing
         * ourselves into the futex hash. This code thus has to
-        * rely on the FUTEX_WAKE code doing a wakeup after removing
+        * rely on the futex_wake() code doing a wakeup after removing
         * the waiter from the list.
         */
        add_wait_queue(&q.waiters, &wait);
@@ -481,9 +482,13 @@ out:
        return ret;
 }

-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
+#define OLD_FUTEX_WAIT (0)
+#define OLD_FUTEX_WAKE (1)
+#define OLD_FUTEX_FD (2)
+
+static long do_old_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
 {
-       unsigned long pos_in_page, pos_in_page2;
+       unsigned long pos_in_page;
        int ret;

        pos_in_page = uaddr % PAGE_SIZE;
@@ -493,21 +498,13 @@ long do_futex(unsigned long uaddr, int o
                return -EINVAL;

        switch (op) {
-       case FUTEX_WAIT:
+       case OLD_FUTEX_WAIT:
                ret = futex_wait(uaddr, pos_in_page, val, timeout);
                break;
-       case FUTEX_WAKE:
+       case OLD_FUTEX_WAKE:
                ret = futex_wake(uaddr, pos_in_page, val);
                break;
-       case FUTEX_REQUEUE:
-               pos_in_page2 = uaddr2 % PAGE_SIZE;
-
-               /* Must be "naturally" aligned */
-               if (pos_in_page2 % sizeof(u32))
-                       return -EINVAL;
-               ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2, val);
-               break;
-       case FUTEX_FD:
+       case OLD_FUTEX_FD:
                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
                ret = futex_fd(uaddr, pos_in_page, val);
                break;
@@ -517,17 +514,63 @@ long do_futex(unsigned long uaddr, int o
        return ret;
 }

-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
+asmlinkage long old_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;

-       if ((op == FUTEX_WAIT) && utime) {
+       if ((op == OLD_FUTEX_WAIT) && utime) {
                if (copy_from_user(&t, utime, sizeof(t)) != 0)
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
+       return do_old_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
+}
+
+asmlinkage long sys_futex_wait(u32 __user *__uaddr, int val, struct timespec __user *utime)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+       unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+       struct timespec t;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       if (utime) {
+               if (copy_from_user(&t, utime, sizeof(t)) != 0)
+                       return -EFAULT;
+               timeout = timespec_to_jiffies(&t) + 1;
+       }
+       return futex_wait(uaddr, pos_in_page, val, timeout);
+}
+
+asmlinkage long sys_futex_wake(u32 __user *__uaddr, int val)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       return futex_wake(uaddr, pos_in_page, val);
+}
+
+asmlinkage long sys_futex_requeue(u32 __user *__uaddr1,
+                               u32 __user *__uaddr2, int nr_wake)
+{
+       unsigned long uaddr1 = (unsigned long)__uaddr1,
+                       uaddr2 = (unsigned long)__uaddr2;
+       unsigned long pos_in_page1 = uaddr1 % PAGE_SIZE,
+                       pos_in_page2 = uaddr2 % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if ((pos_in_page1 | pos_in_page2) % sizeof(u32))
+               return -EINVAL;
+
+       return futex_requeue(uaddr1, pos_in_page1, uaddr2, pos_in_page2, nr_wake);
 }

 static struct super_block *
--- linux/kernel/fork.c.orig    
+++ linux/kernel/fork.c
@@ -457,7 +457,7 @@ void mm_release(struct task_struct *tsk,
                 * not set up a proper pointer then tough luck.
                 */
                put_user(0, tidptr);
-               sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
+               sys_futex_wake(tidptr, 1);
        }
 }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by bert hubert » Tue, 20 May 2003 16:50:15



>  - start the phasing out of FUTEX_FD. This i believe is quite unclean and
>    unrobust, because it attaches a new concept (futexes) to a very old
>    (polling) concept. We want futex support in kernel-AIO, not in the
>    polling APIs. AFAIK only NGPT uses FUTEX_FD.

I for one would want the ability to select, poll and epoll on a futex while
also being notified of availability of data on sockets. I see no alternative
even, except for messing with signals or running select with a small
timeout, introducing needless latency.

It may be weird, but it does work in practice. 'Unrobust' would be a problem
but I fail to see how this is unclean.

Thanks.

--
http://www.PowerDNS.com      Open source, database driven DNS Software
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Linus Torvalds » Tue, 20 May 2003 17:30:07



>  - start the phasing out of FUTEX_FD. This i believe is quite unclean and
>    unrobust, because it attaches a new concept (futexes) to a very old
>    (polling) concept. We want futex support in kernel-AIO, not in the
>    polling APIs. AFAIK only NGPT uses FUTEX_FD.

This sounds like a bad idea.

Expecting "select()" and "poll()" to go away, and calling them "unclean
and unrobust" is just silly and stupid. There are a hell of a lot more
programs using select-loops out there than there are AIO versions, and I'd
argue that AIO is likely to be the much more "unrobust" solution, and
probably doesn't even scale any better than using epoll.

In fact, it's hard to see any real advantages of aio over a sane polling
loop, as long as the polling doesn't have some O(n) overhead (in other
words, as long as you use epoll).

So stop pushing your own agenda and broken morals down other people's
throats, and re-do this patch properly.

                Linus

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 18:10:12



> I for one would want the ability to select, poll and epoll on a futex
> while also being notified of availability of data on sockets. I see no
> alternative even, except for messing with signals or running select with
> a small timeout, introducing needless latency.

> It may be weird, but it does work in practice. 'Unrobust' would be a
> problem but I fail to see how this is unclean.

ok, i was flaming away mindlessly. New patch on the way.

        Ingo

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

futex requeueing feature, futex-requeue-2.5.69-D3

Post by Ingo Molnar » Tue, 20 May 2003 18:20:04


> >  - start the phasing out of FUTEX_FD. This i believe is quite unclean and
> >    unrobust, [...]

FUTEX_FD is an instant DoS, it allows the pinning of one page per file
descriptor, per thread. With a default limit of 1024 open files per
thread, and 256 threads (on a sane/conservative setup), this means 1 GB of
RAM can be pinned down by a normal unprivileged user.

Any suggestions how to fix it - or am i missing something why it should
not be considered a problem?

updated patch attached - it now has proper sys_futex_fd() support as well.

        Ingo

--- linux/include/linux/futex.h.orig    
+++ linux/include/linux/futex.h
@@ -2,11 +2,15 @@
 #define _LINUX_FUTEX_H

 /* Second argument to futex syscall */
-#define FUTEX_WAIT (0)
-#define FUTEX_WAKE (1)
-#define FUTEX_FD (2)
-#define FUTEX_REQUEUE (3)

-extern asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2);
+asmlinkage long sys_futex_wait(u32 __user *__uaddr,int val,
+                               struct timespec __user *utime);
+
+asmlinkage long sys_futex_wake(u32 __user *__uaddr, int nr_wake);
+
+asmlinkage long sys_futex_fd(u32 __user *__uaddr, int signal);
+
+asmlinkage long sys_futex_requeue(u32 __user *__uaddr1, u32 __user *__uaddr2,
+                                       int nr_wake);

 #endif
--- linux/arch/i386/kernel/entry.S.orig
+++ linux/arch/i386/kernel/entry.S      
@@ -837,7 +837,7 @@ ENTRY(sys_call_table)
        .long sys_fremovexattr
        .long sys_tkill
        .long sys_sendfile64
-       .long sys_futex         /* 240 */
+       .long old_futex         /* 240 */
        .long sys_sched_setaffinity
        .long sys_sched_getaffinity
        .long sys_set_thread_area
@@ -865,6 +865,9 @@ ENTRY(sys_call_table)
        .long sys_clock_gettime         /* 265 */
        .long sys_clock_getres
        .long sys_clock_nanosleep
-
+       .long sys_futex_wait
+       .long sys_futex_wake
+       .long sys_futex_fd              /* 270 */
+       .long sys_futex_requeue

 nr_syscalls=(.-sys_call_table)/4
--- linux/kernel/futex.c.orig  
+++ linux/kernel/futex.c        
@@ -98,13 +98,13 @@ static inline struct list_head *hash_fut
  *
  * Must be called with (and returns with) all futex-MM locks held.
  */
-static inline
-struct page *__pin_page_atomic (struct page *page)
+static inline struct page *__pin_page_atomic (struct page *page)
 {
        if (!PageReserved(page))
                get_page(page);
        return page;
 }
+
 static struct page *__pin_page(unsigned long addr)
 {
        struct mm_struct *mm = current->mm;
@@ -155,7 +155,7 @@ static inline void unpin_page(struct pag
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int offset, int num)
+static inline int futex_wake(unsigned long uaddr, int offset, int num)
 {
        struct list_head *i, *next, *head;
        struct page *page;
@@ -220,7 +220,8 @@ static void futex_vcache_callback(vcache
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(unsigned long uaddr1, int offset1, unsigned long uaddr2, int offset2, int num)
+static inline int futex_requeue(unsigned long uaddr1, int offset1,
+               unsigned long uaddr2, int offset2, int num)
 {
        struct list_head *i, *next, *head1, *head2;
        struct page *page1 = NULL, *page2 = NULL;
@@ -308,7 +309,7 @@ static inline int unqueue_me(struct fute
        return ret;
 }

-static int futex_wait(unsigned long uaddr,
+static inline int futex_wait(unsigned long uaddr,
                      int offset,
                      int val,
                      unsigned long time)
@@ -347,7 +348,7 @@ static int futex_wait(unsigned long uadd
         * The get_user() above might fault and schedule so we
         * cannot just set TASK_INTERRUPTIBLE state when queueing
         * ourselves into the futex hash. This code thus has to
-        * rely on the FUTEX_WAKE code doing a wakeup after removing
+        * rely on the futex_wake() code doing a wakeup after removing
         * the waiter from the list.
         */
        add_wait_queue(&q.waiters, &wait);
@@ -481,9 +482,13 @@ out:
        return ret;
 }

-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
+#define OLD_FUTEX_WAIT (0)
+#define OLD_FUTEX_WAKE (1)
+#define OLD_FUTEX_FD (2)
+
+static long do_old_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2)
 {
-       unsigned long pos_in_page, pos_in_page2;
+       unsigned long pos_in_page;
        int ret;

        pos_in_page = uaddr % PAGE_SIZE;
@@ -493,21 +498,13 @@ long do_futex(unsigned long uaddr, int o
                return -EINVAL;

        switch (op) {
-       case FUTEX_WAIT:
+       case OLD_FUTEX_WAIT:
                ret = futex_wait(uaddr, pos_in_page, val, timeout);
                break;
-       case FUTEX_WAKE:
+       case OLD_FUTEX_WAKE:
                ret = futex_wake(uaddr, pos_in_page, val);
                break;
-       case FUTEX_REQUEUE:
-               pos_in_page2 = uaddr2 % PAGE_SIZE;
-
-               /* Must be "naturally" aligned */
-               if (pos_in_page2 % sizeof(u32))
-                       return -EINVAL;
-               ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2, val);
-               break;
-       case FUTEX_FD:
+       case OLD_FUTEX_FD:
                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
                ret = futex_fd(uaddr, pos_in_page, val);
                break;
@@ -517,17 +514,75 @@ long do_futex(unsigned long uaddr, int o
        return ret;
 }

-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
+asmlinkage long old_futex(u32 __user *uaddr, int op, int val, struct timespec __user *utime, u32 __user *uaddr2)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;

-       if ((op == FUTEX_WAIT) && utime) {
+       if ((op == OLD_FUTEX_WAIT) && utime) {
                if (copy_from_user(&t, utime, sizeof(t)) != 0)
                        return -EFAULT;
                timeout = timespec_to_jiffies(&t) + 1;
        }
-       return do_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
+       return do_old_futex((unsigned long)uaddr, op, val, timeout, (unsigned long)uaddr2);
+}
+
+asmlinkage long sys_futex_wait(u32 __user *__uaddr, int val, struct timespec __user *utime)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+       unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+       struct timespec t;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       if (utime) {
+               if (copy_from_user(&t, utime, sizeof(t)) != 0)
+                       return -EFAULT;
+               timeout = timespec_to_jiffies(&t) + 1;
+       }
+       return futex_wait(uaddr, pos_in_page, val, timeout);
+}
+
+asmlinkage long sys_futex_wake(u32 __user *__uaddr, int nr_wake)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       return futex_wake(uaddr, pos_in_page, nr_wake);
+}
+
+asmlinkage long sys_futex_fd(u32 __user *__uaddr, int signal)
+{
+       unsigned long uaddr = (unsigned long)__uaddr;
+       unsigned long pos_in_page = uaddr % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if (pos_in_page % sizeof(u32))
+               return -EINVAL;
+
+       return futex_fd(uaddr, pos_in_page, signal);
+}
+
+asmlinkage long sys_futex_requeue(u32 __user *__uaddr1,
+                               u32 __user *__uaddr2, int nr_wake)
+{
+       unsigned long uaddr1 = (unsigned long)__uaddr1,
+                       uaddr2 = (unsigned long)__uaddr2;
+       unsigned long pos_in_page1 = uaddr1 % PAGE_SIZE,
+                       pos_in_page2 = uaddr2 % PAGE_SIZE;
+
+       /* Must be "naturally" aligned */
+       if ((pos_in_page1 | pos_in_page2) % sizeof(u32))
+               return -EINVAL;
+
+       return futex_requeue(uaddr1, pos_in_page1, uaddr2, pos_in_page2, nr_wake);
 }

 static struct super_block *
--- linux/kernel/fork.c.orig    
+++ linux/kernel/fork.c
@@ -457,7 +457,7 @@ void mm_release(struct task_struct *tsk,
                 * not set up a proper pointer then tough luck.
                 */
                put_user(0, tidptr);
-               sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
+               sys_futex_wake(tidptr, 1);
        }
 }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/