async-io API registration for 2.5.29

async-io API registration for 2.5.29

Post by Andrea Arcangel » Wed, 31 Jul 2002 14:50:05



Hello,

this patch against 2.5.29 adds the async-io API as from latest Ben's
patch.

I find the dynamic syscall approach in some vendor kernel out there
that implements a /proc/libredhat unacceptable since it's not forward
compatible with 2.5:

@@ -636,6 +637,12 @@
        .long SYMBOL_NAME(sys_ni_syscall)       /* reserved for fremovexattr */
        .long SYMBOL_NAME(sys_tkill)

+       .rept __NR_sys_dynamic_syscall-(.-sys_call_table)/4
+               .long SYMBOL_NAME(sys_ni_syscall)
+       .endr
+       .long SYMBOL_NAME(sys_dynamic_syscall)
+       .long SYMBOL_NAME(sys_io_submit)
+
        .rept NR_syscalls-(.-sys_call_table)/4
                .long SYMBOL_NAME(sys_ni_syscall)
        .endr

diff -urN v2.4.19-pre5/include/asm-i386/unistd.h
linux.diff/include/asm-i386/unistd.h
--- v2.4.19-pre5/include/asm-i386/unistd.h      Wed Apr  3 21:04:38 2002
+++ linux.diff/include/asm-i386/unistd.h        Sat May 18 11:44:01 2002
@@ -245,6 +245,9 @@

 #define __NR_tkill             238

+#define __NR_sys_dynamic_syscall       250
+#define __NR_io_submit                 251
+
 /* user-visible error numbers are in the range -1 - -124: see
 * <asm-i386/errno.h> */

to try not to execute random code they use a magic number chosen at
compile time from /dev/urandom, so the probability to execute random
code is low but still there's a chance. For the io_submit I'm not even
sure if it's using the magic anymore (I guess checking the cookie
payload was a showstopper performance hit, in some older patch the
io_submit operation was passing through the slowdown of the dynamic
syscall but in fact the new code does this:

+asmlinkage long vsys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+       long res;
+       __asm__ volatile ("int $0x80"
+               : "=a" (res)
+               : "0" (__NR_io_submit), "b" (ctx_id), "c" (nr),
+                 "d" (iocbpp));
+       return res;
+}

). So I would ask if you could merge the below interface into 2.5 so we can
ship a real async-io with real syscalls in 2.4, there's not much time to
change it given this is just used in production userspace today. I
prepared a patch against 2.5.29. Ben, I would appreciate if you could
review and confirm you're fine with it too.

BTW, I'm not the author of the API, and personally I dislike the
sys_io_submit approach, the worst part is the multiplexing of course:

+               if (IOCB_CMD_PREAD == tmp.aio_lio_opcode) {
+                       op = file->f_op->aio_read;
+                       if (unlikely(!(file->f_mode & FMODE_READ)))
+                               goto out_put_req;
+               } else if (IOCB_CMD_PREADX == tmp.aio_lio_opcode) {
+                       op = file->f_op->aio_readx;
+                       if (unlikely(!(file->f_mode & FMODE_READ)))
+                               goto out_put_req;
+               } else if (IOCB_CMD_PWRITE == tmp.aio_lio_opcode) {
+                       op = file->f_op->aio_write;
+                       if (unlikely(!(file->f_mode & FMODE_WRITE)))
+                               goto out_put_req;
+               } else if (IOCB_CMD_FSYNC == tmp.aio_lio_opcode) {
+                       op = file->f_op->aio_fsync;
+               } else if (IOCB_CMD_POLL == tmp.aio_lio_opcode) {
+                       op = generic_aio_poll;
+               } else
+                       op = NULL;

instead of separate syscalls for the various async_io
PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry
point and a parameter specifies the operation. But this is what the
current userspace expects and I wouldn't have too much time to change it
anyways because then I would break all the userspace libs too (I just
break them because of the true syscalls instead of passing through the
/proc/libredhat that calls into the dynamic syscall, but that's not
too painful to adapt). And after all even the io_submit isn't too bad
besides the above slowdown in the multiplexing (at least it's sharing
some icache for top/bottom of the functionality).

checked that it still compiles fine on x86 (all other archs should keep
compiling too). available also from here:

        http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/...

Comments are welcome, many thanks.

diff -urNp 2.5.29/arch/i386/kernel/entry.S aio-api-1/arch/i386/kernel/entry.S
--- 2.5.29/arch/i386/kernel/entry.S     Sat Jul 27 06:07:21 2002
+++ aio-api-1/arch/i386/kernel/entry.S  Tue Jul 30 05:23:46 2002
@@ -753,6 +753,12 @@ ENTRY(sys_call_table)
        .long sys_sched_setaffinity
        .long sys_sched_getaffinity
        .long sys_set_thread_area
+       .long sys_io_setup
+       .long sys_io_destroy    /* 245 */
+       .long sys_io_submit
+       .long sys_io_cancel
+       .long sys_io_wait
+       .long sys_io_getevents

        .rept NR_syscalls-(.-sys_call_table)/4
                .long sys_ni_syscall
diff -urNp 2.5.29/fs/Makefile aio-api-1/fs/Makefile
--- 2.5.29/fs/Makefile  Wed Jul 17 02:13:47 2002
+++ aio-api-1/fs/Makefile       Tue Jul 30 05:25:03 2002
@@ -15,7 +15,7 @@ obj-y :=      open.o read_write.o devices.o f
                namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
                dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
                filesystems.o namespace.o seq_file.o xattr.o libfs.o \
-               fs-writeback.o mpage.o direct-io.o
+               fs-writeback.o mpage.o direct-io.o aio.o

 ifneq ($(CONFIG_NFSD),n)
 ifneq ($(CONFIG_NFSD),)
diff -urNp 2.5.29/fs/aio.c aio-api-1/fs/aio.c
--- 2.5.29/fs/aio.c     Thu Jan  1 01:00:00 1970
+++ aio-api-1/fs/aio.c  Tue Jul 30 05:33:20 2002
@@ -0,0 +1,38 @@
+#include <linux/kernel.h>
+#include <linux/aio.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+
+asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp)
+{
+       return -ENOSYS;
+}
+
+asmlinkage long sys_io_destroy(aio_context_t ctx)
+{
+       return -ENOSYS;
+}
+
+asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+       return -ENOSYS;
+}
+
+asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb)
+{
+       return -ENOSYS;
+}
+
+asmlinkage long sys_io_wait(aio_context_t ctx_id, struct iocb *iocb,
+                           const struct timespec *timeout)
+{
+       return -ENOSYS;
+}
+
+asmlinkage long sys_io_getevents(aio_context_t ctx_id,
+                                long nr,
+                                struct io_event *events,
+                                const struct timespec *timeout)
+{
+       return -ENOSYS;
+}
diff -urNp 2.5.29/include/asm-i386/unistd.h aio-api-1/include/asm-i386/unistd.h
--- 2.5.29/include/asm-i386/unistd.h    Sun Apr 14 22:09:06 2002
+++ aio-api-1/include/asm-i386/unistd.h Tue Jul 30 05:22:38 2002
@@ -247,6 +247,13 @@
 #define __NR_futex             240
 #define __NR_sched_setaffinity 241
 #define __NR_sched_getaffinity 242
+#define __NR_set_thread_area   243
+#define __NR_io_setup          244
+#define __NR_io_destroy                245
+#define __NR_io_submit         246
+#define __NR_io_cancel         247
+#define __NR_io_wait           248
+#define __NR_io_getevents      249

 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */

diff -urNp 2.5.29/include/linux/aio.h aio-api-1/include/linux/aio.h
--- 2.5.29/include/linux/aio.h  Thu Jan  1 01:00:00 1970
+++ aio-api-1/include/linux/aio.h       Tue Jul 30 05:32:30 2002
@@ -0,0 +1,6 @@
+#ifndef __LINUX__AIO_H
+#define __LINUX__AIO_H
+
+#include <linux/aio_abi.h>
+
+#endif /* __LINUX__AIO_H */
diff -urNp 2.5.29/include/linux/aio_abi.h aio-api-1/include/linux/aio_abi.h
--- 2.5.29/include/linux/aio_abi.h      Thu Jan  1 01:00:00 1970
+++ aio-api-1/include/linux/aio_abi.h   Tue Jul 30 05:57:23 2002
@@ -0,0 +1,86 @@
+/* linux/aio_abi.h
+ *
+ * Copyright 2000,2001,2002 Red Hat.
+ *
+ * Written by Benjamin LaHaise <b...@redhat.com>
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation is hereby granted, provided that the above copyright
+ * notice appears in all copies.  This software is provided without any
+ * warranty, express or implied.  Red Hat makes no representations about
+ * the suitability of this software for any purpose.
+ *
+ * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+ * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND
+ * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ */
+#ifndef __LINUX__AIO_ABI_H
+#define __LINUX__AIO_ABI_H
+
+#include <asm/byteorder.h>
+
+typedef unsigned long  aio_context_t;
+
+enum {
+       IOCB_CMD_PREAD = 0,
+       IOCB_CMD_PWRITE = 1,
+       IOCB_CMD_FSYNC = 2,
+       IOCB_CMD_FDSYNC = 3,
+       IOCB_CMD_PREADX = 4,
+       IOCB_CMD_POLL = 5,
+       IOCB_CMD_NOOP = 6,
+};
+
+/* read() from /dev/aio returns these structures. */
+struct io_event {
+       __u64           data;           /* the data field from the iocb */
+       __u64           obj;            /* what iocb this event came from */
+       __s64           res;            /* result code for this event */
+       __s64           res2;           /* secondary result */
+};
+
+#if defined(__LITTLE_ENDIAN)
+#define PADDED(x,y)    x, y
+#elif defined(__BIG_ENDIAN)
+#define PADDED(x,y)    y, x
+#else
+#error edit for your odd byteorder.
+#endif
+
+/*
+ * we always use a 64bit off_t when communicating
+ * with userland.  its up to libraries to do the
+ * proper padding and aio_error abstraction
+ */
+
+struct iocb {
+       /* these are internal to the kernel/libc. */
+       __u64   aio_data;       /* data to be returned in event's data */
+       __u32   PADDED(aio_key, aio_reserved1);
+                               /* the kernel sets aio_key to the req # */
+
+       /* common fields */
+       __u16   aio_lio_opcode; /* see IOCB_CMD_ above */
+       __s16   aio_reqprio;
+       __u32   aio_fildes;
+
+       __u64   aio_buf;
+       __u64   aio_nbytes;
+       __s64   aio_offset;
+
+       /* extra parameters */
+       __u64   aio_reserved2;
+       __u64   aio_reserved3;
+}; /* 64 bytes */
+
+#undef IFBIG
+#undef IFLITTLE
+
+#endif /* __LINUX__AIO_ABI_H */

Andrea
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Christoph Hellwi » Wed, 31 Jul 2002 17:20:08



> I find the dynamic syscall approch in some vendor kernel out there
> that implements a /proc/libredhat unacceptable since it's not forward
> compatible with 2.5:

What is /proc/libredhat supposed to be?  It hasn't ever been part of the
AIO patches.

Quote:> ). So I would ask if you could merge the below interface into 2.5 so we can
> ship a real async-io with real syscalls in 2.4, there's not much time to
> change it given this is just used in production userspace today. I
> prepared a patch against 2.5.29. Ben, I would appreciate if you could
> review and confirm you're fine with it too.

Please don't.  First Ben has indicated on kernel summit that the abi might
change and I think it's a bad idea to lock him into the old ABI just because
suse doesn't want to have something called libredhat.so* in /lib.
Alternate suggestion: rename it to libunited.so.

And even if there is a syscall reservation the way to do it is not to add
the real syscall names to entry.S and implement stubs but to use
sys_ni_syscall.

Quote:> BTW, I'm not the author of the API, and personally I dislike the
> sys_io_sumbit approch, the worst part is the multiplexing of course:

Okay.  So you think the API is stupid but want it to get in without
discussion??

If you really want to ship the old-style AIO (of which I remember ben
saying it is broken for everything post-2.4.9) please stick to the patch
Ben has around, otherwise wait for the proper 2.5 solution.  I have my
doubts that it is backportable, though.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Rik van Rie » Wed, 31 Jul 2002 20:00:09



> I find the dynamic syscall approch in some vendor kernel out there
> that implements a /proc/libredhat unacceptable since it's not forward
> compatible with 2.5:

How do you know this one will be compatible with 2.5 ?

You yourself had suggestions for improving the interface
and I wouldn't be surprised if at least some of those
would get merged for 2.5 and would end up changing the
interface ;)

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/             http://distro.conectiva.com/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Benjamin LaHais » Wed, 31 Jul 2002 22:00:06



> instead of separate syscalls for the various async_io
> PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry
> point and a parameters specify the operation. But this is what the
> current userspace expects and I wouldn't have too much time to change it
> anyways because then I would break all the userspace libs too (I just
> break them because of the true syscalls instead of passing through the
> /proc/libredhat that calls into the dynamic syscall, but that's not
> too painful to adapt). And after all even the io_submit isn't too bad
> besides the above slowdown in the multiplexing (at least it's sharing
> some icache for top/bottom of the functionality).

What would you suggest as an alternative API?  The main point of multiplexing
is that ios can be submitted in batches, which can't be done if the ios are
submitted via individual syscalls, not to mention the overlap with the posix
aio api.

Quote:> checked that it still compiles fine on x86 (all other archs should keep
> compiling too). available also from here:

>    http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/...

> Comments are welcome, many thanks.

That's the old cancellation API.  Anyways, the core is pretty much ready, so
don't bother with this patch.

                -ben
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Suparna Bhattachary » Wed, 31 Jul 2002 22:40:07



>  Anyways, the core is pretty much ready, so

Hey Ben, That sounds great. Have been looking forward to it
to find out how much has changed and if you've left anything
for us to do :) (other than docs and driver fixes :( )

I did have an updated version of the bio traversal patch
(for 2.5.29) that avoids modifications to the bv_offset/bv_len
fields by the block layer, though I don't know if you
still need it. Besides, you probably wouldn't run into
those cases often, as the partial request completions
are probably rare. But just as a fyi ...

Regards
Suparna

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Linus Torvald » Wed, 31 Jul 2002 22:40:09



> this patch against 2.5.29 adds the async-io API as from latest Ben's
> patch.

Why not make the io_submit system call number 251 like it apparently is
already in 2.4.x? We're really close to it anyway, so if you just re-order
the system calls a bit (and leave 250 as sys_ni_syscall), you're basically
there.

Other than that it looks good.

                Linus

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Linus Torvald » Wed, 31 Jul 2002 22:50:07



> And even if there is a syscall reservation the way to do it is not to add
> the real syscall names to entry.S and implement stubs but to use
> sys_ni_syscall.

Note that something needs to get moving on this rsn, I'm not interested in
getting aio patches on Oct 30th. The feature freeze may be on Halloween,
but if I get some big feature just days before I'm likely to just say
"screw it".

I think we can still change the stuff in 2.5.x, but I really want to start
seeing some code, so that I'm not taken by surprise by something that
obviously sucks.

Is there any activity on linux-aio? I haven't heard anything since Ottawa.

                Linus

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Benjamin LaHais » Wed, 31 Jul 2002 23:00:08



> I think we can still change the stuff in 2.5.x, but I really want to start
> seeing some code, so that I'm not taken by surprise by something that
> obviously sucks.

Sorry, I was away last week.  I'm updating patches to 2.5.29, and should have
them ready by the afternoon for people to comment on.  There are a couple of
things to check on ia64 and x86-64 ABI-wise, and people need to comment on the
in-kernel f_ops->read/write changes.

                -ben
--
"You will be reincarnated as a toad; and you will be much happier."
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Andrea Arcangel » Thu, 01 Aug 2002 01:50:07


On Tue, Jul 30, 2002 at 09:11:40AM +0100, Christoph Hellwig wrote:
> On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> > I find the dynamic syscall approch in some vendor kernel out there
> > that implements a /proc/libredhat unacceptable since it's not forward
> > compatible with 2.5:

> What is /proc/libredhat supposed to be?  It hasn't ever been part of the
> AIO patches.

you should read the code then (from the latest aio-20020619.diff).

diff -urN v2.4.19-pre5/Makefile linux.diff/Makefile
--- v2.4.19-pre5/Makefile       Wed Apr  3 21:04:25 2002
+++ linux.diff/Makefile Fri Apr 19 20:57:16 2002
@@ -226,7 +226,7 @@
        drivers/sound/pndsperm.c \
        drivers/sound/pndspini.c \
        drivers/atm/fore200e_*_fw.c drivers/atm/.fore200e_*.fw \
-       .version .config* config.in config.old \
+       .uniquebytes .version .config* config.in config.old \
        scripts/tkparse scripts/kconfig.tk scripts/kconfig.tmp \
        scripts/lxdialog/*.o scripts/lxdialog/lxdialog \
        .menuconfig.log \
@@ -268,6 +268,7 @@
                --end-group \
                -o vmlinux
        $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw]
\)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map
+       @$(MAKE) -C ulib

 symlinks:
        rm -f include/asm
@@ -296,7 +297,7 @@

 linuxsubdirs: $(patsubst %, _dir_%, $(SUBDIRS))

-$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/version.h
include/config/MARKER
+$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/compile.h
include/config/MARKER
        $(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C $(patsubst
_dir_%, %, $@)

 $(TOPDIR)/include/linux/version.h: include/linux/version.h
@@ -322,6 +323,11 @@
           echo \#define LINUX_COMPILE_DOMAIN ; \
         fi >> .ver
        @echo \#define LINUX_COMPILER \"`$(CC) $(CFLAGS) -v 2>&1 | tail
-1`\" >> .ver
+       @rm -f .uniquebytes
+       @dd if=/dev/urandom of=.uniquebytes bs=1 count=16
+       @echo -n \#"define LINUX_UNIQUE_BYTES " >>.ver
+       @hexdump -v -e '1/1 "0x%02x, "' .uniquebytes | sed -e 's/,
$$//g' >>.ver
+       @echo "" >>.ver
        @mv -f .ver $@

 include/linux/version.h: ./Makefile
@@ -404,6 +410,8 @@
 .PHONY: $(patsubst %, _modinst_%, $(SUBDIRS))
 $(patsubst %, _modinst_%, $(SUBDIRS)) :
        $(MAKE) -C $(patsubst _modinst_%, %, $@) modules_install
+       mkdir -p  $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/
+       install -m 755 ulib/libredhat-kernel.so.1.0.1 $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/

 # modules disabled....

diff -urN v2.4.19-pre5/ulib/Makefile linux.diff/ulib/Makefile
--- v2.4.19-pre5/ulib/Makefile  Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/Makefile    Fri Apr 19 20:58:01 2002
@@ -0,0 +1,50 @@
+#  Makefile - libredhat-kernel.so build code.
+#
+#    Copyright 2002 Red Hat, Inc.  All Rights Reserved.
+#
+#    This library is free software; you can redistribute it and/or
+#    modify it under the terms of the GNU Lesser General Public
+#    License as published by the Free Software Foundation; either
+#    version 2 of the License, or (at your option) any later version.
+#
+#    This library is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#    Lesser General Public License for more details.
+#
+#    You should have received a copy of the GNU Lesser General Public
+#    License along with this library; if not, write to the Free
Software
+#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307  USA
+#
+#
+all: libredhat-kernel.so
+
+ASFLAGS=-D__KERNEL__ -D__ASSEMBLY__ -I../include -nostdlib -nostartfiles
+CFLAGS=-D__KERNEL__ -I../include -nostdlib -nostartfiles
+
+so_objs=vsysaddr.o kso_init.o
+
+vsysaddr.S: ../System.map stub.S Makefile
+       rm -f vsysaddr.S
+       echo '#include "stub.S"' >vsysaddr.S
+       awk -- "/^00000000bfff.* vsys_/ { print \"dynamic_syscall(\"\$$3 \",0x\" \$$1 \")\"; }" <../System.map >>vsysaddr.S
+       awk -- "/^bfff.* vsys_/ { print \"dynamic_syscall(\"\$$3 \",0x\" \$$1 \")\"; }" <../System.map >>vsysaddr.S
+
+vsysaddr.o: vsysaddr.S
+
+kso_init.o: ../include/linux/compile.h
+
+libredhat-kernel.so.1.0.1: $(so_objs) libredhat-kernel.map
+       gcc -nostdlib -nostartfiles -shared
-Wl,--version-script=libredhat-kernel.map
-Wl,-soname=libredhat-kernel.so.1 -o $@  $(so_objs)
+       cp $@ $@.save
+       strip $@
+
+libredhat-kernel.so: libredhat-kernel.so.1.0.1
+       ln -sf $< $@
+
+clean:
+       rm -f *.o libredhat-kernel.so myln libredhat-kernel.so.1*
vsysaddr.S
+
+# test app
+myln: myln.c libredhat-kernel.so Makefile
+       cc -g -o myln myln.c -L. -lredhat-kernel
diff -urN v2.4.19-pre5/ulib/README linux.diff/ulib/README
--- v2.4.19-pre5/ulib/README    Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/README      Fri Apr 19 20:54:05 2002
@@ -0,0 +1,2 @@
+The libredhat-kernel code is provided under the terms of the LGPL.
+See the file COPYING for details.
diff -urN v2.4.19-pre5/ulib/kso_init.c linux.diff/ulib/kso_init.c
--- v2.4.19-pre5/ulib/kso_init.c        Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/kso_init.c  Fri Apr 19 20:54:05 2002
@@ -0,0 +1,67 @@
+/* kso_init.c - libredhat-kernel.so startup code.
+
+    Copyright 2002 Red Hat, Inc.  All Rights Reserved.
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
+
+ */
+#include <linux/compile.h>
+#include <linux/types.h>
+#include <asm/unistd.h>
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+#include <asm/a.out.h>
+
+char libredhat_kernel_enosys = 1;      /* the asm in stub.S depends on
this */
+
+long _init(void)
+{
+       static char unique[] = { LINUX_UNIQUE_BYTES };
+       int errno;
+       long addr;
+       int fd;
+       int i;
+
+       _syscall6(int, mmap2, unsigned long, addr, unsigned long, len,
+                 unsigned long, prot, unsigned long, flags,
+                 unsigned long, fd, unsigned long, pgoff)
+       _syscall2(long, munmap, unsigned long, addr, size_t, len)
+       _syscall2(int, open, const char *, name, int, flags)
+       _syscall1(int, close, int, fd)
+
+       if (sizeof(unique) != 16)
+               return -1;
+
+       fd = open("/dev/vsys", O_RDONLY);
+       if (-1 == fd)
+               return -1;
+
+       addr = mmap2(0, VSYSCALL_SIZE, PROT_READ | PROT_EXEC,
MAP_SHARED, fd, 0);
+       if (-1 == addr)
+               return -1;
+
+       close(fd);
+
+       for (i=0; i<sizeof(unique); i++)
+               if (unique[i] != ((char *)addr)[i]) {
+                       munmap(addr, VSYSCALL_SIZE);
+                       return -1;
+               }
+
+       /* okay, all the syscalls we provide are now good */
+       libredhat_kernel_enosys = 0;
+       return 0;
+}
+
diff -urN v2.4.19-pre5/ulib/libredhat-kernel.map
linux.diff/ulib/libredhat-kernel.map
--- v2.4.19-pre5/ulib/libredhat-kernel.map      Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/libredhat-kernel.map        Tue Apr  2 18:56:58 2002
@@ -0,0 +1,11 @@
+REDHAT_0.90 {
+       global:
+               vsys_io_setup;
+               vsys_io_destroy;
+               vsys_io_submit;
+               vsys_io_cancel;
+               vsys_io_wait;
+               vsys_io_getevents;
+       local:
+               *;
+};
diff -urN v2.4.19-pre5/ulib/myln.c linux.diff/ulib/myln.c
--- v2.4.19-pre5/ulib/myln.c    Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/myln.c      Tue Apr  2 18:56:58 2002
@@ -0,0 +1,25 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+int main ()
+{
+       long ctx = 0;
+       extern long vsys_io_setup(long, long *);
+       unsigned char *bob = (void*)&vsys_io_setup;
+       long ret;
+       int i;
+       printf("%p\n", bob);
+       //printf("%p\n", mmap(0, 65536, PROT_READ | PROT_EXEC,
MAP_SHARED,
+       //      open("/dev/vsys", O_RDONLY), 0));
+       //for (i=0; i<16; i++)
+       //      printf(" %02x\n", bob[i]);
+       //printf("\n");
+
+       ret = vsys_io_setup(100, &ctx);
+
+       printf("ret=%ld, ctx=0x%lx\n", ret, ctx);
+       return 0;
+}
diff -urN v2.4.19-pre5/ulib/stub.S linux.diff/ulib/stub.S
--- v2.4.19-pre5/ulib/stub.S    Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/stub.S      Fri Apr 19 20:54:05 2002
@@ -0,0 +1,38 @@
+/* stub.S - libredhat-kernel.so jump code.
+
+    Copyright 2002 Red Hat, Inc.  All Rights Reserved.
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
+
+ */
+/* stub.S */
+#include <asm/segment.h>
+#include <asm/errno.h>
+
+       .text
+
+#define dynamic_syscall(x,a) \
+       .globl  x                               ;\
+       .type   x, @function                    ;\
+       .align 16                               ;\
+       x:                                      ;\
+               cmpb $0,libredhat_kernel_enosys ;\
+               jne 1f                          ;\
+               ljmp $__USER_CS, $a             ;\
+       1:                                      ;\
+               movl    $-ENOSYS,%eax           ;\
+               ret                             ;\
+       .size    x,.-x
+

and the other funny parts:

+long sys_dynamic_syscall(struct pt_regs regs)
+{
+       struct dummy_args dummy_args;
+       struct vsyscall_entry *ent = (void *)regs.edx;
+       void *args = (void *)regs.ecx;
+       long ret;
+
+       pr_debug("ent = %p  args = %p\n", ent, args);
+       pr_debug("eip = 0x%08lx\n", regs.eip);
+
+       if (unlikely(!current->mm->vsys_mapped))
+               goto err;
@ -231,6 +232,10 @@

        /* Architecture-specific MM context */
        mm_context_t context;
+
+       struct kioctx   *ioctx_list;
+       unsigned long   new_ioctx_id;
...

read more »

 
 
 

async-io API registration for 2.5.29

Post by Andrea Arcangel » Thu, 01 Aug 2002 01:50:09




> > this patch against 2.5.29 adds the async-io API as from latest Ben's
> > patch.

> Why not make the io_submit system call number 251 like it apparently is
> already in 2.4.x? We're really close to it anyway, so if you just re-order
> the system calls a bit (and leave 250 as sys_ni_syscall), you're basically
> there.

> Other than that it looks good.

thank you very much for checking it. Since Ben asked to wait for his
patch you can reject my patch, that's really fine with me as long as it
doesn't take months for his patch to show up. My patch is in perfect sync
with his latest code on the web.

As I said, I never claimed the current API is stupid, as Christoph understood; I
said I'd have preferred sys_aio_read/write/fsync etc... but I could live
fine with sys_io_submit too, it wasn't bad enough to make me rewrite
it.

With my patch I mainly wanted to draw attention to this issue so we can
hopefully get an API registered in a few weeks in mainline. I'm
completely flexible to rewrite the API too if anybody finds good reasons
for it (or if you say, sys_io_submit is too ugly please change to
sys_aio_read/write/etc..).

As Ben said the API is the only thing that has been mostly stable so far,
this is one more reason I felt this is the right way to proceed instead
of building the dynamic syscall slowdown overhead layer that at best
(unsure for sys_io_submit 250) is forward binary compatible with 2.5 by
pure luck.

thanks,

Andrea
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Benjamin LaHais » Thu, 01 Aug 2002 02:10:06



> > Please don't.  First Ben has indicated on kernel summit that the abi might
> > change and I think it's a bad idea to lock him into the old ABI just because

> What I heard and that I remember crystal clear is that Ben indicated that
> the API isn't changing for a long time, and that's been stable so far,
> I could imagine why.

I suspect what Christoph is remembering is that the in-kernel API was still
in flux and up for discussion.

Quote:> I'm trying to do my best to avoid having to merge the code I quoted
> above, that's disgusting and since the api isn't gonna change anyway
> like Ben said I'm trying to do the right thing to avoid clashes with
> syscall 250 as well.

syscall 250 isn't used in anything Red Hat shipped, that was a matter
of experimentation I was doing in recent aio development trees (which
is what the 2.4.18 patches are, as they still cause that VM to OOM under
rather trivial io patterns).

Quote:> Really last thing: one of the major reasons I don't like the above code
> besides the overhead and complexity it introduces is that it doesn't
> guarantee 100% that it will be forward compatible with 2.5 applications
> (the syscall 250 looks not to check even for the payload, I guess they
> changed it because it was too slow to be forward compatible in most
> cases), the /dev/urandom payload may match the user arguments if you're
> unlucky and since we can guarantee correct operations by doing a syscall
> registration, I don't see why we should make it work by luck.

You haven't looked at the code very closely then.  It checks that the
payload matches, and that the caller is coming from the vsyscall pages.  
Yes, the dynamic syscall thing is a horrific kludge that shouldn't be
used, but the vsyscall technique is rather useful.  This is something
that x86-64 gets wrong by not requiring the vsyscall page to need an
mmap into the user's address space: UML cannot emulate vsyscalls by
faking the mmap.

                -ben
--
"You will be reincarnated as a toad; and you will be much happier."
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Jeff Dik » Thu, 01 Aug 2002 03:10:08



Quote:> This is something  that x86-64 gets wrong by not requiring the
> vsyscall page to need an  mmap into the user's address space: UML
> cannot emulate vsyscalls by  faking the mmap.

Andrea and I talked about this a bit at KS.

IIRC, he wants vsyscall addresses to be hardcoded constants in libc.  He
doesn't want the overhead of doing an indirect call through whatever
address you get from the vsyscall_mmap() syscall.

At first glance, that breaks any hope of UML being able to virtualize that.
Any vsyscall executed by a UML process will go straight into the host kernel,
completely bypassing UML.

We did come up with a scheme that sounded to me like it would work.

/me tries to remember what it was :-)

I think it was that we provide a syscall to move the vsyscall page.  UML
will use that to relocate the host vsyscalls and map its own page there.
The final piece is that UML would be linked with a different vsyscall address.

Andrea, does that sound right?

I don't particularly like this scheme - the get-the-address-at-runtime
approach is far cleaner, but it does satisfy Andrea's need for speed.

                                Jeff

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Benjamin LaHais » Thu, 01 Aug 2002 03:20:06



> We did come up with a scheme that sounded to me like it would work.

A constant address is still an option with an mmap'd device.  Just do
an mmap of the device and assert that it is the correct value.

                -ben
--
"You will be reincarnated as a toad; and you will be much happier."
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Linus Torvald » Thu, 01 Aug 2002 03:20:09




> > We did come up with a scheme that sounded to me like it would work.

> A constant address is still an option with an mmap'd device.  Just do
> an mmap of the device and assert that it is the correct value.

That still doesn't get the TLB advantages of a globally shared page at the
same address.. It also has the overhead of mapping it, which you don't
have if the thing is just always in the address space, and all processes
just get created with that page mapped. That can be a big deal for process
startup latency for small processes.

                Linus

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

async-io API registration for 2.5.29

Post by Benjamin LaHais » Thu, 01 Aug 2002 03:40:06



> That still doesn't get the TLB advantages of a globally shared page at the
> same address.. It also has the overhead of mapping it, which you don't
> have if the thing is just always in the address space, and all processes
> just get created with that page mapped. That can be a big deal for process
> startup latency for small processes.

That might be a concern once glibc startup can occur with less than a few
dozen calls to grope through the local files. ;-)  Hmmm, it would be possible
to make the vsyscall page mapped by default and leave the global bit enabled
until UML forcibly unmapped it (and then clear the global bit and do a global
invalidate).  Would that be acceptable?

                -ben
--
"You will be reincarnated as a toad; and you will be much happier."
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

1. missing symbols in oss modules with 2.5.29

if [ -r System.map ]; then /sbin/depmod -ae -F System.map  2.5.29; fi
depmod: *** Unresolved symbols in
/lib/modules/2.5.29/kernel/sound/oss/mpu401.o
depmod:         cli
depmod:         restore_flags
depmod:         sti
depmod:         save_flags
depmod: *** Unresolved symbols in
/lib/modules/2.5.29/kernel/sound/oss/sound.o
depmod:         cli
depmod:         restore_flags
depmod:         sti
depmod:         save_flags
depmod: *** Unresolved symbols in
/lib/modules/2.5.29/kernel/sound/oss/uart401.o
depmod:         cli
depmod:         restore_flags
depmod:         save_flags
depmod: *** Unresolved symbols in
/lib/modules/2.5.29/kernel/sound/oss/v_midi.o
depmod:         cli
depmod:         restore_flags
depmod:         save_flags

Hope this helps

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

2. truss command...

3. 4: 2.5.29-keyboard

4. *Cheap Internet Connection*

5. 2.5.29: scsi/pcmcia|sound/trident|devfs

6. APM reads 05% when IBM TP560 is saying out of power

7. 2.5.29 serial update

8. Current directory in prompt in Korn shell?

9. 3: 2.5.29-exports

10. 2.5.29-rdunzip

11. 8: 2.5.29-rd

12. 11: 2.5.29-8390

13. 2.5.29 sound/oss/trident.c [2/2] remove cli/sti calls