[PATCH] 2.5.35 patch for making DIO async

Post by Badari Pulavarty » Thu, 19 Sep 2002 06:10:07



Hi Ben,

Here is a 2.5.35 patch to make DIO async. Basically, DIO no longer
waits for I/O completion; waiting is done at the higher level
(the same way generic_file_read does).

Here is what I did:

1) pass a kiocb *iocb down to DIO
2) allocate the "dio" with kmalloc (instead of using the stack)
3) after submitting bios, the dio code returns with -EIOCBQUEUED
4) removed dio_bio_end_io(), dio_await_one(), dio_await_completion()
5) added dio_bio_end_aio(), which calls dio_bio_complete() for
   each bio and, once dio->bio_count reaches 0, calls aio_complete()
6) changed the raw_read/raw_write/... routines to set up a sync kiocb
   and wait on it (see the sketch below)
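
In short, each synchronous entry point becomes a thin wrapper around the
async path. A distilled sketch of the pattern (function names as in the
patch below, simplified):

static ssize_t
rw_raw_dev(int rw, struct file *filp, const struct iovec *iov,
           unsigned long nr_segs, loff_t *offp)
{
        struct kiocb kiocb;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);          /* on-stack iocb for sync callers */
        ret = rw_raw_aio_dev(rw, &kiocb, iov, nr_segs, offp);
        if (ret == -EIOCBQUEUED)                /* bios in flight; DIO did not wait */
                ret = wait_on_sync_kiocb(&kiocb); /* wait here, at the higher level */
        return ret;
}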

Any feedback on approach/patch is appreciated.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.35/drivers/char/raw.c linux-2.5.35.aio/drivers/char/raw.c
--- linux-2.5.35/drivers/char/raw.c     Sun Sep 15 19:18:37 2002
+++ linux-2.5.35.aio/drivers/char/raw.c Tue Sep 17 09:00:39 2002
@@ -201,8 +201,9 @@
 }

 static ssize_t
-rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+rw_raw_aio_dev(int rw, struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
 {
+       struct file *filp = iocb->ki_filp;
        const int minor = minor(filp->f_dentry->d_inode->i_rdev);
        struct block_device *bdev = raw_devices[minor].binding;
        struct inode *inode = bdev->bd_inode;
@@ -222,7 +223,7 @@
                count = inode->i_size - *offp;
                nr_segs = iov_shorten((struct iovec *)iov, nr_segs, count);
        }
-       ret = generic_file_direct_IO(rw, inode, iov, *offp, nr_segs);
+       ret = generic_file_direct_IO(rw, iocb, inode, iov, *offp, nr_segs);

        if (ret > 0)
                *offp += ret;
@@ -231,6 +232,19 @@
 }

 static ssize_t
+rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+{
+       struct kiocb kiocb;
+       ssize_t ret;
+
+       init_sync_kiocb(&kiocb, filp);
+       ret = rw_raw_aio_dev(rw, &kiocb, iov, nr_segs, offp);
+       if (-EIOCBQUEUED == ret)
+               ret = wait_on_sync_kiocb(&kiocb);
+       return ret;
+}
+
+static ssize_t
 raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
 {
        struct iovec local_iov = { .iov_base = buf, .iov_len = size};
@@ -246,6 +260,21 @@
        return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
 }

+static ssize_t
+raw_aio_read(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+       struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+       return rw_raw_aio_dev(READ, iocb, &local_iov, 1, offp);
+}
+
+static ssize_t
+raw_aio_write(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+       struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+       return rw_raw_aio_dev(WRITE, iocb, &local_iov, 1, offp);
+}
 static ssize_t
 raw_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
 {
@@ -260,7 +289,9 @@

 static struct file_operations raw_fops = {
        .read   =       raw_read,
+       .aio_read=      raw_aio_read,
        .write  =       raw_write,
+       .aio_write=     raw_aio_write,
        .open   =       raw_open,
        .release=       raw_release,
        .ioctl  =       raw_ioctl,
diff -Naur -X dontdiff linux-2.5.35/fs/block_dev.c linux-2.5.35.aio/fs/block_dev.c
--- linux-2.5.35/fs/block_dev.c Sun Sep 15 19:19:09 2002
+++ linux-2.5.35.aio/fs/block_dev.c     Tue Sep 17 09:00:26 2002
@@ -116,10 +116,10 @@
 }

 static int
-blkdev_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
-                       loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
+               const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
-       return generic_direct_IO(rw, inode, iov, offset,
+       return generic_direct_IO(rw, iocb, inode, iov, offset,
                                nr_segs, blkdev_get_blocks);
 }

diff -Naur -X dontdiff linux-2.5.35/fs/direct-io.c linux-2.5.35.aio/fs/direct-io.c
--- linux-2.5.35/fs/direct-io.c Sun Sep 15 19:18:30 2002
+++ linux-2.5.35.aio/fs/direct-io.c     Tue Sep 17 09:48:16 2002
@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/buffer_head.h>
 #include <linux/rwsem.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>

 /*
@@ -68,6 +69,8 @@
        spinlock_t bio_list_lock;       /* protects bio_list */
        struct bio *bio_list;           /* singly linked via bi_private */
        struct task_struct *waiter;     /* waiting task (NULL if none) */
+       struct kiocb *iocb;             /* iocb ptr */
+       int results;
 };

 /*
@@ -145,25 +148,41 @@
 }

 /*
- * The BIO completion handler simply queues the BIO up for the process-context
- * handler.
- *
- * During I/O bi_private points at the dio.  After I/O, bi_private is used to
- * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ * Process one completed BIO.  No locks are held.
  */
-static void dio_bio_end_io(struct bio *bio)
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+       struct bio_vec *bvec = bio->bi_io_vec;
+       int page_no;
+
+       for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+               struct page *page = bvec[page_no].bv_page;
+
+               if (dio->rw == READ)
+                       set_page_dirty(page);
+               page_cache_release(page);
+       }
+       atomic_dec(&dio->bio_count);
+       bio_put(bio);
+       return uptodate ? 0 : -EIO;
+}
+
+static void dio_bio_end_aio(struct bio *bio)
+{
+       int ret;
        struct dio *dio = bio->bi_private;
-       unsigned long flags;
+       ret = dio_bio_complete(dio, bio);
+       if (ret)
+               dio->results = ret;

-       spin_lock_irqsave(&dio->bio_list_lock, flags);
-       bio->bi_private = dio->bio_list;
-       dio->bio_list = bio;
-       if (dio->waiter)
-               wake_up_process(dio->waiter);
-       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+       if (atomic_read(&dio->bio_count) == 0) {
+               aio_complete(dio->iocb, dio->results, 0);
+               kfree(dio);
+       }
 }

+
 static int
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
                sector_t first_sector, int nr_vecs)
@@ -180,7 +199,7 @@
        bio->bi_size = 0;
        bio->bi_sector = first_sector;
        bio->bi_io_vec[0].bv_page = NULL;
-       bio->bi_end_io = dio_bio_end_io;
+       bio->bi_end_io = dio_bio_end_aio;

        dio->bio = bio;
        dio->bvec = NULL;            /* debug */
@@ -212,75 +231,6 @@
 }

 /*
- * Wait for the next BIO to complete.  Remove it and return it.
- */
-static struct bio *dio_await_one(struct dio *dio)
-{
-       unsigned long flags;
-       struct bio *bio;
-
-       spin_lock_irqsave(&dio->bio_list_lock, flags);
-       while (dio->bio_list == NULL) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (dio->bio_list == NULL) {
-                       dio->waiter = current;
-                       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-                       blk_run_queues();
-                       schedule();
-                       spin_lock_irqsave(&dio->bio_list_lock, flags);
-                       dio->waiter = NULL;
-               }
-               set_current_state(TASK_RUNNING);
-       }
-       bio = dio->bio_list;
-       dio->bio_list = bio->bi_private;
-       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-       return bio;
-}
-
-/*
- * Process one completed BIO.  No locks are held.
- */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
-{
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec;
-       int page_no;
-
-       for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-               struct page *page = bvec[page_no].bv_page;
-
-               if (dio->rw == READ)
-                       set_page_dirty(page);
-               page_cache_release(page);
-       }
-       atomic_dec(&dio->bio_count);
-       bio_put(bio);
-       return uptodate ? 0 : -EIO;
-}
-
-/*
- * Wait on and process all in-flight BIOs.
- */
-static int dio_await_completion(struct dio *dio)
-{
-       int ret = 0;
-
-       if (dio->bio)
-               dio_bio_submit(dio);
-
-       while (atomic_read(&dio->bio_count)) {
-               struct bio *bio = dio_await_one(dio);
-               int ret2;
-
-               ret2 = dio_bio_complete(dio, bio);
-               if (ret == 0)
-                       ret = ret2;
-       }
-       return ret;
-}
-
-/*
  * A really large O_DIRECT read or write can generate a lot of BIOs.  So
  * to keep the memory consumption sane we periodically reap any completed BIOs
  * during the BIO generation phase.
@@ -528,77 +478,83 @@
 }

 int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
-       loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode * inode,
+       const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+       get_blocks_t get_blocks)
 {
        const unsigned blkbits = inode->i_blkbits;
        unsigned long user_addr;
-       int seg, ret2, ret = 0;
-       struct dio dio;
-       size_t bytes, tot_bytes = 0;
-
-       dio.bio = NULL;
-       dio.bvec = NULL;
-       dio.inode = inode;
-       dio.rw = rw;
-       dio.blkbits = blkbits;
-       dio.block_in_file = offset >> blkbits;
-       dio.blocks_available = 0;
-
-       dio.boundary = 0;
-       dio.reap_counter = 0;
-       dio.get_blocks = get_blocks;
-       dio.last_block_in_bio = -1;
-       dio.next_block_in_bio = -1;
+       int seg, ret = 0;
+       struct dio *dio;
+       size_t bytes;
+
+       dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+       if (!dio)
+               return -ENOMEM;
+
+       dio->bio = NULL;
+       dio->bvec = NULL;
+       dio->inode = inode;
+       dio->iocb = iocb;
+       dio->rw = rw;
+       dio->blkbits = blkbits;
+       dio->block_in_file = offset >> blkbits;
+       dio->blocks_available = 0;

-       dio.page_errors = 0;
+       dio->results = 0;
+       dio->boundary = 0;
+       dio->reap_counter = 0;
+       dio->get_blocks = get_blocks;
+       dio->last_block_in_bio = -1;
+       dio->next_block_in_bio = -1;
+
+       dio->page_errors = 0;

        /* BIO completion state */
-       atomic_set(&dio.bio_count, 0);
-       spin_lock_init(&dio.bio_list_lock);
-       dio.bio_list = NULL;
-       dio.waiter = NULL;
+       atomic_set(&dio->bio_count, 0);
+       spin_lock_init(&dio->bio_list_lock);
+       dio->bio_list = NULL;
+       dio->waiter = NULL;

        for (seg = 0; seg < nr_segs; seg++) {
                user_addr = (unsigned long)iov[seg].iov_base;
                bytes = iov[seg].iov_len;

                /* Index into the first page of the first block */
-               dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
-               dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+               dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+               dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
                /* Page fetching state */
-               dio.head = 0;
-               dio.tail = 0;
-               dio.curr_page = 0;
+               dio->head = 0;
+               dio->tail = 0;
+               dio->curr_page = 0;

-      
...


[PATCH] 2.5.35 patch for making DIO async

Post by Benjamin LaHaise » Thu, 19 Sep 2002 21:00:06



> Hi Ben,

> Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
> wait for io completion. Waiting will be done at the higher level
> (same way generic_file_read does).

This looks pretty good.  Andrew Morton has had a look over it too and
agrees that it should go in after a bit of testing.  Does someone want
to try comparing throughput of dio based aio vs normal read/write?  It
would be interesting to see what kind of an effect pipelining aios makes
and if there are any performance niggles we need to squash.  Cheers,

                -ben

[PATCH] 2.5.35 patch for making DIO async

Post by Badari Pulavarty » Fri, 20 Sep 2002 01:10:05



> > Hi Ben,

> > Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
> > wait for io completion. Waiting will be done at the higher level
> > (same way generic_file_read does).

> This looks pretty good.  Andrew Morton has had a look over it too and
> agrees that it should go in after a bit of testing.  Does someone want
> to try comparing throughput of dio based aio vs normal read/write?  It
> would be interesting to see what kind of an effect pipelining aios makes
> and if there are any performance niggles we need to squash.  Cheers,

>            -ben

Thanks for looking at the patch. The patch needed a few cleanups to get it
working. Here is the status so far:

1) I tested synchronous raw read/write. They perform almost the same as
   without this patch. I am looking at why I can't match the performance.
   (I get 380MB/sec on 40 dd's on 40 disks without this patch.
    I get 350MB/sec with this patch).

2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
   write perform well.

3) I am testing aio read/writes, using libaio 0.3.92.
   When I try aio_read/aio_write on a raw device, I get an OOPS.
   Can I use libaio 0.3.92 on 2.5? Are there any interface
   changes I need to worry about between 2.4 and 2.5?
   (See the test sketch below.)

Once I get aio read/writes working, I will provide you the performance
numbers.
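
(For reference, a minimal userspace test of the kind described in point 3,
sketched against the libaio API as it exists today; the raw device path and
sizes are illustrative, and io_getevents() in 0.3.92 may take one fewer
argument.)

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <libaio.h>

int main(void)
{
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        void *buf;
        int fd, ret;

        fd = open("/dev/raw/raw1", O_RDONLY);   /* illustrative raw device */
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (posix_memalign(&buf, 512, 65536))   /* raw I/O wants sector alignment */
                return 1;
        if (io_queue_init(32, &ctx) < 0)        /* io_* calls return -errno */
                return 1;

        io_prep_pread(&cb, fd, buf, 65536, 0);  /* queue one 64K read at offset 0 */
        ret = io_submit(ctx, 1, cbs);
        if (ret != 1) {
                fprintf(stderr, "io_submit: %d\n", ret);
                return 1;
        }
        ret = io_getevents(ctx, 1, 1, &ev, NULL);       /* wait for completion */
        if (ret == 1)
                printf("res=%ld\n", (long)ev.res);

        io_queue_release(ctx);
        return 0;
}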

Thanks,
Badari


[PATCH] 2.5.35 patch for making DIO async

Post by Benjamin LaHaise » Fri, 20 Sep 2002 01:10:08



> Thanks for looking at the patch. The patch needed a few cleanups to get it
> working. Here is the status so far:

> 1) I tested synchronous raw read/write. They perform almost the same as
>    without this patch. I am looking at why I can't match the performance.
>    (I get 380MB/sec on 40 dd's on 40 disks without this patch.
>     I get 350MB/sec with this patch).

Hmm, we should try to improve that.  Have you tried profiling a long run?

> 2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
>    write perform well.

That should be somewhat conditional, but for now it sounds like the right
thing to do.
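
(Roughly what that looks like: a sketch of wait_on_sync_kiocb() with the
unconditional queue kick; the kiocb field names here are assumed, not
quoted from the 2.5 source.)

ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
        while (iocb->ki_users) {                /* dropped by aio_complete() */
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!iocb->ki_users)
                        break;
                blk_run_queues();               /* the kick discussed in point 2 */
                schedule();
        }
        set_current_state(TASK_RUNNING);
        return iocb->ki_user_data;              /* result stashed by aio_complete() */
}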

> 3) I am testing aio read/writes, using libaio 0.3.92.
>    When I try aio_read/aio_write on a raw device, I get an OOPS.
>    Can I use libaio 0.3.92 on 2.5? Are there any interface
>    changes I need to worry about between 2.4 and 2.5?

libaio 0.3.92 should work on 2.5.  Could you send me a copy of the
decoded Oops?

                -ben
--
GMS rules.

[PATCH] 2.5.35 patch for making DIO async

Post by Badari Pulavarty » Fri, 20 Sep 2002 01:40:06



> > Thanks for looking at the patch. The patch needed a few cleanups to get it
> > working. Here is the status so far:

> > 1) I tested synchronous raw read/write. They perform almost the same as
> >    without this patch. I am looking at why I can't match the performance.
> >    (I get 380MB/sec on 40 dd's on 40 disks without this patch.
> >     I get 350MB/sec with this patch).

> Hmm, we should try to improve that.  Have you tried profiling a long run?

I took a few shortcuts in the patch compared to the original DIO code;
e.g., I create a new bio every time, whereas the original code reuses them.
But I don't think that is causing the performance hit. Anyway,
here is the vmstat and profile=2 output.

vmstat:

 0 40  2      0 3804628      0  27396   0   0 344909   192 3786  3536   0   5  95
 0 40  1      0 3803268      0  28628   0   0 345686   208 3793  3578   0   5  95
 0 40  1      0 3802452      0  29404   0   0 346983   210 3806  3619   0   5  95
 0 40  1      0 3801888      0  29932   0   0 345178   203 3786  3500   0   5  95
 0 40  1      0 3801804      0  29996   0   0 345792   196 3791  3510   0   5  95
 0 40  1      0 3801652      0  30100   0   0 346048   200 3800  3556   0   5  95

profile:

146796 default_idle                             2293.6875
  1834 __scsi_end_request                        10.4205
  1201 do_softirq                                 6.2552
   891 scsi_dispatch_cmd                          2.3203
    50 __wake_up                                  1.0417
   599 get_user_pages                             0.7341
   362 blk_rq_map_sg                              0.6856
   420 do_direct_IO                               0.5707
    25 dio_prep_bio                               0.5208
   562 __make_request                             0.4878
    55 __scsi_release_command                     0.4167
     6 cap_file_permission                        0.3750
    63 release_console_sem                        0.3580
    55 do_aic7xxx_isr                             0.3438
   141 bio_alloc                                  0.3389
    45 scsi_finish_command                        0.2557
     8 scsi_sense_valid                           0.2500
    16 dio_new_bio                                0.2500
   206 schedule                                   0.2341


> > 2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
> >    write perform well.

> That should be somewhat conditional, but for now it sounds like the right
> thing to do.

> > 3) I am testing aio read/writes. I am using libaio.0.3.92.
> >    When I try aio_read/aio_write on raw device, I am get OOPS.
> >    Can I use libaio.0.3.92 on 2.5 ? Are there any interface
> >    changes I need to worry  between 2.4 and 2.5 ?

> libaio 0.3.92 should work on 2.5.  Could you send me a copy of the
> decoded Oops?

My Bad :)

The aio_read/aio_write() routines take a loff_t, not a loff_t * (as the regular read/write routines do):

        ssize_t (*read) (struct file *, char *, size_t, loff_t *);
        ssize_t (*aio_read) (struct kiocb *, char *, size_t, loff_t);

I coded for loff_t *; fixed it. I am testing it now.
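
(So the aio entry points from the patch take the offset by value; a
corrected sketch, which also drops the stray const from the read buffer:)

static ssize_t
raw_aio_read(struct kiocb *iocb, char *buf, size_t size, loff_t pos)
{
        struct iovec local_iov = { .iov_base = buf, .iov_len = size };

        /* the offset arrives by value; hand a pointer to the local copy down */
        return rw_raw_aio_dev(READ, iocb, &local_iov, 1, &pos);
}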

Thanks,
Badari


[PATCH] 2.5.35 patch for making DIO async

Post by Badari Pulavarty » Fri, 20 Sep 2002 05:50:08


Ben,

aio_read/aio_write() are now working, with a minor fix to fs/aio.c:

io_submit_one():

        if (likely(EIOCBQUEUED == ret))

                needs to be changed to

        if (likely(-EIOCBQUEUED == ret))
                  ^^^
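
(In context, the corrected check reads roughly as below; the surrounding
io_submit_one() code is paraphrased, not quoted from 2.5.35.)

        ret = file->f_op->aio_read(req, buf, count, pos);
        if (likely(-EIOCBQUEUED == ret))
                return 0;       /* queued; aio_complete() delivers the result */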

I was wondering what happens in the following case (I think this
happened in my test program).

Let's say I did a sys_io_submit() and my test program did exit().
When the IO completion happened, it tried to do the following and got
an OOPS in aio_complete():

        if (ctx == &ctx->mm->default_kioctx) {

I think "mm" is freed up when the process exits. Do you think this is
possible? How do we handle this?

- Badari

[PATCH] 2.5.35 patch for making DIO async

Post by Suparna Bhattacharya » Fri, 20 Sep 2002 20:00:08



> Ben,

> aio_read/aio_write() are now working with a minor fix to fs/aio.c

> io_submit_one():

>    if (likely(EIOCBQUEUED == ret))

>            needs to be changed to

>    if (likely(-EIOCBQUEUED == ret))
>              ^^^

> I was wondering what happens in the following case (I think this
> happened in my test program).

> Let's say I did a sys_io_submit() and my test program did exit().
> When the IO completion happened, it tried to do the following and got
> an OOPS in aio_complete():

>    if (ctx == &ctx->mm->default_kioctx) {

> I think "mm" is freed up when the process exits. Do you think this is
> possible? How do we handle this?

Do you see this only in the sync case?
init_sync_kiocb ought to increment ctx->reqs_active, so that
exit_aio waits for the iocbs to complete.
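
(The invariant being described, as a rough sketch rather than the 2.5
source: aio_complete() decrements ctx->reqs_active and wakes ctx->wait,
and exit teardown blocks until the count drains.)

static void wait_for_all_aios(struct kioctx *ctx)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue(&ctx->wait, &wait);
        while (ctx->reqs_active) {              /* live iocbs still outstanding */
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!ctx->reqs_active)
                        break;
                schedule();                     /* woken by aio_complete() */
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&ctx->wait, &wait);
}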

Regards
Suparna

[PATCH] 2.5.35 patch for making DIO async

Post by Suparna Bhattacharya » Fri, 20 Sep 2002 20:40:07




> > Ben,

> > aio_read/aio_write() are now working with a minor fix to fs/aio.c

> > io_submit_one():

> >       if (likely(EIOCBQUEUED == ret))

> >               needs to be changed to

> >       if (likely(-EIOCBQUEUED == ret))
> >                 ^^^

> > I was wondering what happens in the following case (I think this
> > happened in my test program).

> > Let's say I did a sys_io_submit() and my test program did exit().
> > When the IO completion happened, it tried to do the following and got
> > an OOPS in aio_complete():

> >       if (ctx == &ctx->mm->default_kioctx) {

> > I think "mm" is freed up when the process exits. Do you think this is
> > possible? How do we handle this?

> Do you see this only in the sync case?
> init_sync_kiocb ought to increment ctx->reqs_active, so that
> exit_aio waits for the iocbs to complete.

Sorry, I guess in the sync case exit_aio shouldn't happen, since
the current thread still has a reference to the mm.

And your problem is with the io_submit case ... I'll have to look more
closely to find out why.

> Regards
> Suparna