direct-to-BIO for O_DIRECT

direct-to-BIO for O_DIRECT

Post by Andrew Morton » Tue, 09 Jul 2002 12:20:06



Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
the kiovec layer.  It's followed by a patch which converts the raw
driver to use the O_DIRECT engine.

CPU utilisation is about the same as the kiovec-based implementation.
Read and write bandwidth are the same too, for 128k chunks.   But with
one megabyte chunks, this implementation is 20% faster at writing.

I assume this is because the kiobuf-based implementation has to stop
and wait for each 128k chunk, whereas this code streams the entire
request, regardless of its size.

This is with a single (oldish) scsi disk on aic7xxx.  I'd expect the
margin to widen on higher-end hardware which likes to have more
requests in flight.

Question is: what do we want to do with this sucker?  These are the
remaining users of kiovecs:

        drivers/md/lvm-snap.c
        drivers/media/video/video-buf.c
        drivers/mtd/devices/blkmtd.c
        drivers/scsi/sg.c

the video and mtd drivers seem to be fairly easy to de-kiobufize.
I'm aware of one proprietary driver which uses kiobufs.  XFS uses
kiobufs a little bit - just to map the pages.

So with a bit of effort and maintainer-irritation, we can extract
the kiobuf layer from the kernel.

Do we want to do that?

 fs/Makefile                 |    2
 fs/block_dev.c              |    7
 fs/buffer.c                 |    2
 fs/direct-io.c              |  491 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/inode.c             |    7
 include/linux/buffer_head.h |    2
 include/linux/fs.h          |   11
 mm/filemap.c                |   64 ++---
 8 files changed, 543 insertions(+), 43 deletions(-)

--- /dev/null   Thu Aug 30 13:30:55 2001
+++ 2.5.25-akpm/fs/direct-io.c  Sun Jul  7 19:40:20 2002
@@ -0,0 +1,491 @@
+/*
+ * mm/direct-io.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * O_DIRECT
+ *
+ * 04Jul2002   a...@zip.com.au
+ *             Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/buffer_head.h>
+#include <linux/rwsem.h>
+#include <asm/atomic.h>
+
+/*
+ * The largest-sized BIO which this code will assemble, in bytes.  Set this
+ * to PAGE_SIZE if your drivers are broken.
+ */
+#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE
+
+/*
+ * How many user pages to map in one call to get_user_pages().  This determines
+ * the size of a structure on the stack.
+ */
+#define DIO_PAGES      64
+
+struct dio {
+       /* BIO submission state */
+       struct bio *bio;                /* bio under assembly */
+       struct bio_vec *bvec;           /* current bvec in that bio */
+       struct inode *inode;
+       int rw;
+       sector_t block_in_file;         /* changes */
+       sector_t final_block_in_request;/* doesn't change */
+       unsigned first_block_in_page;   /* doesn't change */
+       int boundary;                   /* prev block is at a boundary */
+       int reap_counter;               /* rate limit reaping */
+       get_block_t *get_block;
+       sector_t last_block_in_bio;
+
+       /* Page fetching state */
+       int curr_page;                  /* changes */
+       int total_pages;                /* doesn't change */
+       unsigned long curr_user_address;/* changes */
+
+       /* Page queue */
+       struct page *pages[DIO_PAGES];
+       unsigned head;
+       unsigned tail;
+
+       /* BIO completion state */
+       atomic_t bio_count;
+       spinlock_t bio_list_lock;
+       struct bio *bio_list;           /* singly linked via bi_private */
+       wait_queue_head_t wait_q;
+};
+
+/*
+ * How many pages are in the queue?
+ */
+static inline unsigned dio_pages_present(struct dio *dio)
+{
+       return dio->head - dio->tail;
+}
+
+/*
+ * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
+ */
+static int dio_refill_pages(struct dio *dio)
+{
+       int ret;
+       int nr_pages;
+
+       nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+       ret = get_user_pages(
+               current,                        /* Task for fault accounting */
+               current->mm,                 /* whose pages? */
+               dio->curr_user_address,              /* Where from? */
+               nr_pages,                       /* How many pages? */
+               dio->rw == READ,             /* Write to memory? */
+               0,                              /* force (?) */
+               &dio->pages[0],
+               NULL);                          /* vmas */
+
+       if (ret >= 0) {
+               dio->curr_user_address += ret * PAGE_SIZE;
+               dio->curr_page += ret;
+               dio->head = 0;
+               dio->tail = ret;
+               ret = 0;
+       }
+       return ret;    
+}
+
+/*
+ * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
+ * buffered inside the dio so that we can call get_user_pages() against a
+ * decent number of pages, less frequently.  To provide nicer use of the
+ * L1 cache.
+ */
+static struct page *dio_get_page(struct dio *dio)
+{
+       if (dio_pages_present(dio) == 0) {
+               int ret;
+
+               ret = dio_refill_pages(dio);
+               if (ret) {
+                       printk("%s: dio_refill_pages returns %d\n",
+                               __FUNCTION__, ret);
+                       return ERR_PTR(ret);
+               }
+               BUG_ON(dio_pages_present(dio) == 0);
+       }
+       return dio->pages[dio->head++];
+}
+
+/*
+ * The BIO completion handler simply queues the BIO up for the process-context
+ * handler.
+ *
+ * During I/O bi_private points at the dio.  After I/O, bi_private is used to
+ * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ */
+static void dio_bio_end_io(struct bio *bio)
+{
+       struct dio *dio = bio->bi_private;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dio->bio_list_lock, flags);
+       bio->bi_private = dio->bio_list;
+       dio->bio_list = bio;
+       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+       wake_up(&dio->wait_q);
+}
+
+static int
+dio_bio_alloc(struct dio *dio, struct block_device *bdev,
+               sector_t first_sector, int nr_vecs)
+{
+       struct bio *bio;
+
+       bio = bio_alloc(GFP_KERNEL, nr_vecs);
+       if (bio == NULL)
+               return -ENOMEM;
+
+       bio->bi_bdev = bdev;
+       bio->bi_vcnt = nr_vecs;
+       bio->bi_idx = 0;
+       bio->bi_size = 0;
+       bio->bi_sector = first_sector;
+       bio->bi_io_vec[0].bv_page = NULL;
+       bio->bi_end_io = dio_bio_end_io;
+
+       dio->bio = bio;
+       dio->bvec = NULL;            /* debug */
+       return 0;
+}
+
+static void dio_bio_submit(struct dio *dio)
+{
+       struct bio *bio = dio->bio;
+
+       bio->bi_vcnt = bio->bi_idx;
+       bio->bi_idx = 0;
+       bio->bi_private = dio;
+       atomic_inc(&dio->bio_count);
+       submit_bio(dio->rw, bio);
+
+       dio->bio = NULL;
+       dio->bvec = NULL;
+}
+
+/*
+ * Release any resources in case of a failure
+ */
+static void dio_cleanup(struct dio *dio)
+{
+       while (dio_pages_present(dio))
+               page_cache_release(dio_get_page(dio));
+}
+
+/*
+ * Wait for the next BIO to complete.  Remove it and return it.
+ */
+static struct bio *dio_await_one(struct dio *dio)
+{
+       DECLARE_WAITQUEUE(wait, current);
+       unsigned long flags;
+       struct bio *bio;
+
+       spin_lock_irqsave(&dio->bio_list_lock, flags);
+       while (dio->bio_list == NULL) {
+               add_wait_queue(&dio->wait_q, &wait);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (dio->bio_list == NULL) {
+                       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+                       blk_run_queues();
+                       schedule();
+                       spin_lock_irqsave(&dio->bio_list_lock, flags);
+               }
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&dio->wait_q, &wait);
+       }
+       bio = dio->bio_list;
+       dio->bio_list = bio->bi_private;
+       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+       return bio;
+}
+
+/*
+ * Process one completed BIO.  No locks are held.
+ */
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+       struct bio_vec *bvec = bio->bi_io_vec;
+       int page_no;
+       int ret = 0;
+
+       for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+               struct page *page = bvec[page_no].bv_page;
+
+               if (!uptodate) {
+                       if (ret == 0)
+                               ret = -EIO;
+               }
+
+               if (dio->rw == READ)
+                       set_page_dirty(page);
+               page_cache_release(page);
+       }
+       atomic_dec(&dio->bio_count);
+       bio_put(bio);
+       return ret;
+}
+
+/*
+ * Wait on and process all in-flight BIOs.
+ */
+static int dio_await_completion(struct dio *dio)
+{
+       int ret = 0;
+       while (atomic_read(&dio->bio_count)) {
+               struct bio *bio = dio_await_one(dio);
+               int ret2;
+
+               ret2 = dio_bio_complete(dio, bio);
+               if (ret == 0)
+                       ret = ret2;
+       }
+       return ret;
+}
+
+/*
+ * A really large O_DIRECT read or write can generate a lot of BIOs.  So
+ * to keep the memory consumption sane we periodically reap any completed BIOs
+ * during the BIO generation phase.
+ *
+ * This also helps to limit the peak amount of pinned userspace memory.
+ */
+static int dio_bio_reap(struct dio *dio)
+{
+       int ret = 0;
+
+       if (dio->reap_counter++ >= 64) {
+               while (dio->bio_list) {
+                       unsigned long flags;
+                       struct bio *bio;
+                       int ret2;
+
+                       spin_lock_irqsave(&dio->bio_list_lock, flags);
+                       bio = dio->bio_list;
+                       dio->bio_list = bio->bi_private;
+                       spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+                       ret2 = dio_bio_complete(dio, bio);
+                       if (ret == 0)
+                               ret = ret2;
+               }
+               dio->reap_counter = 0;
+       }
+       return ret;
+}
+
+/*
+ * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
+ */
+int do_direct_IO(struct dio *dio)
+{
+       struct inode * const inode = dio->inode;
+       const unsigned blkbits = inode->i_blkbits;
+       const unsigned blocksize = 1 << blkbits;
+       const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+       struct page *page;
+       unsigned block_in_page;
+       int ret;
+
+       /* The I/O can start at any block offset within the first page */
+       block_in_page = dio->first_block_in_page;
+
+       while (dio->block_in_file < dio->final_block_in_request) {
+               int new_page;   /* Need to insert this page into the BIO? */
+
+               page = dio_get_page(dio);
+               if (IS_ERR(page)) {
+                       ret = PTR_ERR(page);
+                       goto out;
+               }
+
+               new_page = 1;
+               for ( ; block_in_page < blocks_per_page; block_in_page++) {
+                       struct buffer_head map_bh;
+                       struct bio *bio;
+
+                       map_bh.b_state = 0;
+                       ret = (*dio->get_block)(inode, dio->block_in_file,
+                                               &map_bh, dio->rw == WRITE);
+                       if (ret) {
+                               printk("%s: get_block returns %d\n",
+                                       __FUNCTION__, ret);
+                               goto fail_release;
+                       }
+                       /* blockdevs do not set buffer_new */
+                       if (buffer_new(&map_bh))
+                               unmap_underlying_metadata(map_bh.b_bdev,
+                                                       map_bh.b_blocknr);
+                       if (!buffer_mapped(&map_bh)) {
+                               ret = -EINVAL;          /* A hole */
+                               goto
...


 
 
 

direct-to-BIO for O_DIRECT

Post by Lincoln Dale » Tue, 09 Jul 2002 12:40:06



>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer.  It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.

>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks.   But with
>one megabyte chunks, this implementation is 20% faster at writing.

>I assume this is because the kiobuf-based implementation has to stop
>and wait for each 128k chunk, whereas this code streams the entire
>request, regardless of its size.

>This is with a single (oldish) scsi disk on aic7xxx.  I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.

i'll have a go at benchmark-testing these.

now have even bigger hardware than before: 2 x 2gbit/s FC HBAs in multiple
dual-processor (Dual P3 Xeon 550MHz 2M L2 cache and Dual P3 Xeon 833MHz
256K L2 cache) boxen, 8 x 15K RPM FC, 28 x 10K RPM SCSI.

cheers,

lincoln.


 
 
 

direct-to-BIO for O_DIRECT

Post by Andi Kleen » Tue, 09 Jul 2002 16:30:09



>    drivers/md/lvm-snap.c
>    drivers/media/video/video-buf.c
>    drivers/mtd/devices/blkmtd.c
>    drivers/scsi/sg.c

> the video and mtd drivers seems to be fairly easy to de-kiobufize.
> I'm aware of one proprietary driver which uses kiobufs.  XFS uses
> kiobufs a little bit - just to map the pages.

lkcd uses it too for its kernel crash dump. I suspect it wouldn't be that
hard to change.

> So with a bit of effort and maintainer-irritation, we can extract
> the kiobuf layer from the kernel.

> Do we want to do that?

I think yes - keeping two kinds of iovectors for IO (kiovecs and BIOs) seems
to be redundant.  kiovecs never fulfilled their original promise of a
universal zero-copy container (e.g. they were too heavyweight for networking),
so it's probably best to remove them as a failed experiment.

-Andi

 
 
 

direct-to-BIO for O_DIRECT

Post by Ingo Oeser » Tue, 09 Jul 2002 16:50:11



> Question is: what do we want to do with this sucker?  These are the
> remaining users of kiovecs:

>    drivers/md/lvm-snap.c
>    drivers/media/video/video-buf.c
>    drivers/mtd/devices/blkmtd.c
>    drivers/scsi/sg.c

> the video and mtd drivers seems to be fairly easy to de-kiobufize.
> I'm aware of one proprietary driver which uses kiobufs.  XFS uses
> kiobufs a little bit - just to map the pages.

It would be nice if we could just map a set of user pages to a scatterlist.

Developers of mass transfer devices (video grabbers, dsp devices, sg and
many others) would just LOVE you for this ;-)

Block devices are the common case worth optimizing for, but character
devices just need to reimplement most of this, if they want the same
optimizations. Some devices need mass transfers and are NOT blockdevices.

Linux supports only one class of them properly: NICs.

Please consider supporting them better for 2.5 in stuff similar to BIOs
and DMA to/from user pages.

Thanks & Regards

Ingo Oeser

 
 
 

direct-to-BIO for O_DIRECT

Post by Suparna Bhattacharya » Tue, 09 Jul 2002 18:30:05




>>        drivers/md/lvm-snap.c
>>        drivers/media/video/video-buf.c
>>        drivers/mtd/devices/blkmtd.c
>>        drivers/scsi/sg.c

>> the video and mtd drivers seems to be fairly easy to de-kiobufize. I'm
>> aware of one proprietary driver which uses kiobufs.  XFS uses kiobufs a
>> little bit - just to map the pages.

> lkcd uses it too for its kernel crash dump. I suspect it wouldn't be
> that hard to change.

No, it shouldn't be hard to change. In fact, we've had to think of
changing it for 2.5 anyhow, since most likely we can't afford bio
allocs happening under the covers down that path.


>> So with a bit of effort and maintainer-irritation, we can extract the
>> kiobuf layer from the kernel.

>> Do we want to do that?

> I think yes - keeping two kinds of iovectors for IO (kiovecs and BIOs)
> seems to be redundant.
> kiovecs never fulfilled their original promise of a universal zero copy
> container (e.g. they were too heavy weight for networking) so it's
> probably best to remove them as a failed experiment.

Yes, I think kiobufs can go, and we can use something like kvecs
(from the aio code base) instead, which are better for representing
readv/writev in the generic case (i.e. when it's not just
block I/O).  It's easy enough to map kvecs into BIOs or
zero-copy networking.
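
For readers who haven't seen the aio patches, a kvec is essentially a
lightweight vector of (page, offset, length) tuples.  A rough sketch of the
shape -- the field names below are an approximation for illustration, not a
verbatim copy of Ben's code:

struct kveclet {
        struct page     *page;          /* the physical page */
        unsigned        offset;         /* byte offset into that page */
        unsigned        length;         /* bytes used within that page */
};

struct kvec {
        unsigned        max_nr;         /* capacity of the veclet array */
        unsigned        nr;             /* veclets currently in use */
        struct kveclet  veclet[0];      /* the (page, offset, length) tuples */
};

Mapping such a vector onto bio_vecs (or an skb fragment list) is then a
straightforward copy of (page, offset, length) triples.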

Regards
Suparna

> -Andi

 
 
 

direct-to-BIO for O_DIRECT

Post by Matt D. Robinson » Wed, 10 Jul 2002 00:20:07




> >       drivers/md/lvm-snap.c
> >       drivers/media/video/video-buf.c
> >       drivers/mtd/devices/blkmtd.c
> >       drivers/scsi/sg.c

> > the video and mtd drivers seems to be fairly easy to de-kiobufize.
> > I'm aware of one proprietary driver which uses kiobufs.  XFS uses
> > kiobufs a little bit - just to map the pages.

> lkcd uses it too for its kernel crash dump. I suspect it wouldn't be that
> hard to change.

We can remove their use from our 2.5 tree.  Not a problem, as
there are other ways to accomplish what we want.

> -Andi

--Matt
 
 
 

direct-to-BIO for O_DIRECT

Post by Douglas Gilbert » Wed, 10 Jul 2002 13:10:06




> > Question is: what do we want to do with this sucker?  These are the
> > remaining users of kiovecs:

> >       drivers/md/lvm-snap.c
> >       drivers/media/video/video-buf.c
> >       drivers/mtd/devices/blkmtd.c
> >       drivers/scsi/sg.c

> > the video and mtd drivers seems to be fairly easy to de-kiobufize.
> > I'm aware of one proprietary driver which uses kiobufs.  XFS uses
> > kiobufs a little bit - just to map the pages.

> It would be nice if we could just map a set of user pages to a scatterlist.

After disabling kiobufs in sg I would like such a drop-in
replacement.

> Developers of mass transfer devices (video grabbers, dsp devices, sg and
> many others) would just LOVE you for this ;-)

Agreed. Tape devices could be added to your list.
Large page support will make for very efficient zero
copy IO.

> Block devices are the common case worth optimizing for, but character
> devices just need to reimplement most of this, if they want the same
> optimizations. Some devices need mass transfers and are NOT blockdevices.
> Please consider supporting them better for 2.5 in stuff similiar to BIOs
> and DMA to/from user pages.

CIOs?

Doug Gilbert

 
 
 

direct-to-BIO for O_DIRECT

Post by Andrew Morton » Wed, 10 Jul 2002 13:30:07





> > > Question is: what do we want to do with this sucker?  These are the
> > > remaining users of kiovecs:

> > >       drivers/md/lvm-snap.c
> > >       drivers/media/video/video-buf.c
> > >       drivers/mtd/devices/blkmtd.c
> > >       drivers/scsi/sg.c

> > > the video and mtd drivers seems to be fairly easy to de-kiobufize.
> > > I'm aware of one proprietary driver which uses kiobufs.  XFS uses
> > > kiobufs a little bit - just to map the pages.

> > It would be nice if we could just map a set of user pages to a scatterlist.

> After disabling kiobufs in sg I would like such a drop
> in replacement.

Ben had lightweight sg structures called `kvecs' and `kveclets'. And
library functions to map pages into them.  And code to attach them
to BIOs.  So we'll be looking at getting that happening.

The other common requirement (used in several places in the kernel,
and in LVM2) is the ability to perform bulk I/O against a blockdev - simply
read and write a chunk of disk into a list of kernel pages.  So we'll need a
library function for that.   And the O_DIRECT/raw implementation can be bent
around to use those things.
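
For illustration, such a bulk helper could be built the same way the dio code
above assembles its BIOs -- allocate a bio, fill bi_io_vec by hand, submit.
The name and signature below are hypothetical, not an existing kernel
function; completion and waiting are left to the caller via end_io:

/*
 * Illustrative sketch only: read nr_pages whole pages from bdev starting
 * at 'sector' into the given kernel pages.
 */
static int blkdev_read_pages(struct block_device *bdev, sector_t sector,
                        struct page **pages, int nr_pages,
                        void (*end_io)(struct bio *), void *private)
{
        struct bio *bio;
        int i;

        bio = bio_alloc(GFP_KERNEL, nr_pages);
        if (bio == NULL)
                return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio->bi_idx = 0;
        bio->bi_vcnt = nr_pages;
        bio->bi_size = nr_pages * PAGE_SIZE;
        bio->bi_end_io = end_io;        /* caller's completion handler */
        bio->bi_private = private;

        for (i = 0; i < nr_pages; i++) {
                bio->bi_io_vec[i].bv_page = pages[i];
                bio->bi_io_vec[i].bv_offset = 0;
                bio->bi_io_vec[i].bv_len = PAGE_SIZE;
        }

        submit_bio(READ, bio);
        return 0;
}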

> > Developers of mass transfer devices (video grabbers, dsp devices, sg and
> > many others) would just LOVE you for this ;-)

> Agreed. Tape devices could be added to your list.
> Large page support will make for very efficient zero
> copy IO.

Haven't thought about large pages.  We don't seem to have an implementation of
them yet, and I'm not sure how the DMA mapping API would get along with
them.


 
 
 

direct-to-BIO for O_DIRECT

Post by Ingo Oeser » Wed, 10 Jul 2002 17:20:06



> > > It would be nice if we could just map a set of user pages
> > > to a scatterlist.

> > After disabling kiobufs in sg I would like such a drop
> > in replacement.

> Ben had lightweight sg structures called `kvecs' and `kveclets'. And
> library functions to map pages into them.  And code to attach them
> to BIOs.  So we'll be looking at getting that happening.

BIOs are for BLOCK devices; we want something like this for CHARACTER
devices.

I just want something along the lines of this:

/* Pin down (COMPLETE!) user pages and put them into a scatter gather list */
int sg_map_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
                unsigned long uaddr, int rw) {
        int res, i;
        struct page *pages[nr_pages];

        down_read(&current->mm->mmap_sem);
        res = get_user_pages(
                        current,
                        current->mm,
                        uaddr,
                        nr_pages,
                        rw == READ, /* logic is perversed^Wreversed here :-( */
                        0, /* don't force */
                        &pages[0],
                        NULL);
        up_read(&current->mm->mmap_sem);

        /* Errors and no page mapped should return here */
        if (res <= 0) return res;

        for (i = 0; i < res; i++) {
                /* hand each pinned page to a scatterlist entry; the
                   caller still has to fill in offset and length */
                sgl[i].page = pages[i];
        }
        return res;
}

/* And unmap them... */
int sg_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages) {
        int i;

        for (i=0; i < nr_pages; i++)
                page_cache_release(sgl[i].page);

        return 0;

}

Possibly more complicated and less error prone, but you get the
idea ;-)
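
As a usage illustration (everything named my_* below is made up), a character
driver could combine those two helpers with the existing PCI DMA mapping
calls roughly like so:

/* Hypothetical caller: pin a page-aligned user buffer, map it for DMA,
 * run the transfer, then unmap and unpin.  my_pdev and the hardware
 * programming step are placeholders. */
static int my_transfer(struct pci_dev *my_pdev, unsigned long uaddr, int rw)
{
        struct scatterlist sgl[8];              /* up to 8 pages per transfer */
        int pages, ents, i;

        pages = sg_map_user_pages(sgl, 8, uaddr, rw);
        if (pages <= 0)
                return pages ? pages : -EFAULT;

        /* whole, page-aligned pages assumed; a real driver would honour
         * sub-page offsets and lengths here */
        for (i = 0; i < pages; i++) {
                sgl[i].offset = 0;
                sgl[i].length = PAGE_SIZE;
        }

        ents = pci_map_sg(my_pdev, sgl, pages,
                        rw == READ ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);

        /* ... program the device with the 'ents' DMA segments and wait ... */

        pci_unmap_sg(my_pdev, sgl, pages,
                        rw == READ ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
        return sg_unmap_user_pages(sgl, pages);
}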

Regards

Ingo Oeser
--
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth

 
 
 

direct-to-BIO for O_DIRECT

Post by Lincoln Dale » Fri, 12 Jul 2002 11:30:08


At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:

>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer.  It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.

>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks.   But with
>one megabyte chunks, this implementation is 20% faster at writing.
..
>This is with a single (oldish) scsi disk on aic7xxx.  I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.

sorry for the delay.
upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
FC 2300 HBA driver isn't part of the standard kernel, and i had to update it
to reflect the io_request_lock -> host->host_lock, kdev_t and kbuild changes.
urgh, pain pain pain.
in the process, i discovered some races in their driver, so fixed them also.

the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
to Jens, Andrew & co for the changeover.

the results:
   2.4.19pre8aa2 (with lockmeter and profile=2)
      normal     167772160 blocks of 512 bytes in 778 seconds (105.27 mbyte/sec), CPUs 0% idle
      O_DIRECT   20480 blocks of 4194304 bytes in 430 seconds (190.47 mbyte/sec), CPUs ~55% idle
      /dev/rawN  20480 blocks of 4194304 bytes in 463 seconds (176.86 mbyte/sec), CPUs ~62% idle

   2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to 0x80000000
          and your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report below)
      normal     167772160 blocks of 512 bytes in 607 seconds (134.81 mbyte/sec), CPUs 0% idle
      O_DIRECT   20480 blocks of 4194304 bytes in 420 seconds (194.61 mbyte/sec), CPUs ~93% idle
      /dev/rawN  20480 blocks of 4194304 bytes in 422 seconds (193.84 mbyte/sec), CPUs ~92% idle

   2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
      normal     167772160 blocks of 512 bytes in 615 seconds (133.06 mbyte/sec), CPUs 0% idle
      O_DIRECT   20480 blocks of 4194304 bytes in 421 seconds (194.37 mbyte/sec), CPUs ~92% idle
      /dev/rawN  20480 blocks of 4194304 bytes in 421 seconds (194.35 mbyte/sec), CPUs ~92% idle

it's a little hard to tell the CPU load difference between direct-to-BIO and
non-direct-to-BIO, but clearly performance was at 100% of 2gbit/s Fibre
Channel with direct-to-bio; i've never seen it sustain exactly 100%
throughout a test before.

it was interesting to watch the test of 2.4.19pre8aa2 versus both 2.5.25
tests; whether it is a change in the linux scheduler or some other artifact,
all "worker" threads (1 thread per disk) completed at almost exactly the same
time on 2.5.25 kernels.
in contrast, the benchmark on 2.4.19pre8aa2 had some disks complete their
work up to half a minute prior to the last thread finishing -- clearly there
was some degree of "unfairness" between threads that has since been addressed.

i'll see about getting dual 2gbit/s FC HBAs working now; my FC MultiPathing
configuration is having a bad hair day today and i'm not physically near the
test host in question to replace a physical fibre cable reporting errors.

details of how the test was conducted --

test host:
  - dual P3 Xeon (733MHz), 2GB PC133 SDRAM (no HIGHMEM defined)
  - single QLogic FC 2300 HBA operating at 2gbit/s in a 64/66 PCI slot

test:
  - benchmark consisted of sequential read requests in parallel across
    8 x 18G 15K RPM FC disks across the first 10GB of each disk
    (why use "sequential reads" you ask?  because its generally consistent --
    i'm not measuring any i/o re-ordering/elevator behaviour, nor am
    i measuring the speed of any disk-shelf controller cache or
    disk-spindle seek speed.  i'm purely measuring how fast data can
    move from the storage subsystem to userspace).
  - benchmark-test considered complete when all disks have gone idle.
  - benchmark program is multithreaded, one thread per device (a rough
    sketch of one such worker appears after the block-size list below)
  - each test run twice with machine rebooted in-between to ensure
    repeatability

block sizes:
  - for normal, test used 20971520 blocks of 512 bytes (10GB) on each disk
  - for O_DIRECT, test used 2560 blocks of 4194304 bytes (10GB) on each disk
  - for /dev/rawN, test used 2560 blocks of 4194304 bytes (10GB) on each disk
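
for concreteness, a minimal userspace sketch of one such per-disk O_DIRECT
worker (device path, block count and error handling here are placeholders,
not the actual benchmark program):

/* sequential O_DIRECT reads of 4 MB blocks from one disk */
#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BLKSZ   (4 * 1024 * 1024)       /* 4194304-byte requests */
#define NBLKS   2560                    /* 10GB per disk */

int main(void)
{
        void *buf;
        int fd, i;

        /* O_DIRECT needs an aligned buffer; 4096 covers 512-byte sectors */
        if (posix_memalign(&buf, 4096, BLKSZ))
                return 1;

        fd = open("/dev/sdb", O_RDONLY | O_DIRECT);     /* placeholder path */
        if (fd < 0) {
                perror("open");
                return 1;
        }

        for (i = 0; i < NBLKS; i++) {
                if (read(fd, buf, BLKSZ) != BLKSZ) {
                        perror("read");
                        break;
                }
        }

        close(fd);
        return 0;
}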

oops report #1: (virgin 2.5.25)
         oops occurs on attempting to issue a read() on an O_DIRECT device.
         this was corrected with Andrew's patch of:

         Oops: 0000
         CPU:    0
         EIP:    0010:[<801c4e11>]    Not tainted
         Using defaults from ksymoops -t elf32-i386 -a i386
         EFLAGS: 00010296
         eax: 00000080   ebx: 00000000   ecx: f6e83b20   edx: f3e79c00
         esi: f3e79cc0   edi: 00010000   ebp: f6e83b20   esp: f393bdcc
         ds: 0018   es: 0018   ss: 0018
         Stack: 8013e856 820fcde0 00000010 000000c0 2aca6000 00000000
f3e79cc0 00070000
                00000070 801c4fac f6e83b20 f6e83b20 8013edbd 00000000
f6e83b20 00000010
                00000010 00000000 00000000 00000010 00000001 80127acb
f56e9ae0 f54691e0
         Call Trace: [<8013e856>] [<801c4fac>] [<8013edbd>] [<80127acb>]
[<8013e118>]
            [<8013e05f>] [<801269de>] [<80126af8>] [<80140113>]
[<801400a0>] [<8012a9c7>]
            [<8012abad>] [<8011404b>] [<8013a738>] [<8013a8ea>] [<80108a0b>]
         Code: 8b 43 0c c1 ef 09 8b 50 38 8b 40 34 0f ac d0 09 89 c6 85 f6

         >>EIP; 801c4e11 <generic_make_request+11/130>   <=====
         Trace; 8013e856 <bio_alloc+e6/1a0>
         Trace; 801c4fac <submit_bio+5c/70>
         Trace; 8013edbd <ll_rw_kio+1ad/210>
         Trace; 80127acb <handle_mm_fault+6b/e0>
         Trace; 8013e118 <brw_kiovec+a8/100>
         Trace; 8013e05f <generic_direct_IO+ef/100>
         Trace; 801269de <get_user_pages+ee/150>
         Trace; 80126af8 <map_user_kiobuf+b8/100>
         Trace; 80140113 <blkdev_direct_IO+23/30>
         Trace; 801400a0 <blkdev_get_block+0/50>
         Trace; 8012a9c7 <generic_file_direct_IO+167/1e0>
         Trace; 8012abad <generic_file_read+ed/130>
         Trace; 8011404b <schedule+33b/3a0>
         Trace; 8013a738 <vfs_read+98/110>
         Trace; 8013a8ea <sys_read+2a/40>
         Trace; 80108a0b <syscall_call+7/b>
         Code;  801c4e11 <generic_make_request+11/130>
         00000000 <_EIP>:
         Code;  801c4e11 <generic_make_request+11/130>   <=====
            0:   8b 43 0c                  mov    0xc(%ebx),%eax   <=====
         Code;  801c4e14 <generic_make_request+14/130>
            3:   c1 ef 09                  shr    $0x9,%edi
         Code;  801c4e17 <generic_make_request+17/130>
            6:   8b 50 38                  mov    0x38(%eax),%edx
         Code;  801c4e1a <generic_make_request+1a/130>
            9:   8b 40 34                  mov    0x34(%eax),%eax
         Code;  801c4e1d <generic_make_request+1d/130>
            c:   0f ac d0 09               shrd   $0x9,%edx,%eax
         Code;  801c4e21 <generic_make_request+21/130>
           10:   89 c6                     mov    %eax,%esi
         Code;  801c4e23 <generic_make_request+23/130>
           12:   85 f6                     test   %esi,%esi

cheers,

lincoln.


 
 
 

direct-to-BIO for O_DIRECT

Post by Andrew Morton » Fri, 12 Jul 2002 12:20:06



> ...
> sorry for the delay.

Is cool.   Thanks for doing this.

> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
> FC 2300 HBA
> driver isn't part of the standard kernel, and i had to update it to reflect the
> io_request_lock -> host->host_lock, kdev_t and kbuild changes.  urgh, pain
> pain pain.
> in the process, i discovered some races in their driver, so fixed them also.

> the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
> to Jens, Andrew & co for the changeover.

> the results:
>    2.4.19pre8aa2 (with lockmeter and profile=2)
>       normal     167772160 blocks of 512 bytes in 778 seconds (105.27
> mbyte/sec), CPUs 0% idle
>       O_DIRECT   20480 blocks of 4194304 bytes in 430 seconds (190.47
> mbyte/sec), CPUs ~55% idle
>       /dev/rawN  20480 blocks of 4194304 bytes in 463 seconds (176.86
> mbyte/sec), CPUs ~62% idle

>    2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> 0x80000000 and
>           your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> below)
>       normal     167772160 blocks of 512 bytes in 607 seconds (134.81
> mbyte/sec), CPUs 0% idle
>       O_DIRECT   20480 blocks of 4194304 bytes in 420 seconds (194.61
> mbyte/sec), CPUs ~93% idle
>       /dev/rawN  20480 blocks of 4194304 bytes in 422 seconds (193.84
> mbyte/sec), CPUs ~92% idle

The 30% improvement in pagecache-buffered reads is somewhat unexpected.
The blockdevs are not using multipage BIOs - they're still using
buffer_head-based I/O for both reads and writes.  Are you sure that
the 2.4 QLogic driver is using block-highmem?

>    2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
>       normal     167772160 blocks of 512 bytes in 615 seconds (133.06
> mbyte/sec), CPUs 0% idle
>       O_DIRECT   20480 blocks of 4194304 bytes in 421 seconds (194.37
> mbyte/sec), CPUs ~92% idle
>       /dev/rawN  20480 blocks of 4194304 bytes in 421 seconds (194.35
> mbyte/sec), CPUs ~92% idle

OK, so there's nothing there at all really (or there may be.  Hard
to tell when the interface has saturated).

But on my lowly scsi disks I was seeing no change in read bandwidth
either.  Only writes benefitted for some reason.   Can you do
some write testing as well?   If you test writes through the pagecache,
use ext2 and not direct-to-blockdev please - that'll take the multipage
BIOs, buffer_head-bypass route.  Plain old read and write of /dev/XdYY
isn't very optimised at all.

Thanks.


 
 
 

direct-to-BIO for O_DIRECT

Post by Lincoln Dale » Fri, 12 Jul 2002 12:30:09



> >    2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> > 0x80000000 and
> >           your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> > below)
> >       normal     167772160 blocks of 512 bytes in 607 seconds (134.81
> > mbyte/sec), CPUs 0% idle
> >       O_DIRECT   20480 blocks of 4194304 bytes in 420 seconds (194.61
> > mbyte/sec), CPUs ~93% idle
> >       /dev/rawN  20480 blocks of 4194304 bytes in 422 seconds (193.84
> > mbyte/sec), CPUs ~92% idle

>The 30% improvement in pagecache-buffered reads is somewhat unexpected.
>The blockdevs are not using multipage BIOs - they're still using
>buffer_head-based I/O for both reads and writes.  Are you sure that
>the 2.4 QLogic driver is using block-highmem?

pretty sure -- there's no highmem in the system :-)
(i.e. i changed PAGE_OFFSET in order to prevent there being any highmem).


         MemTotal:      1945680 kB
         MemFree:       1853812 kB
         MemShared:           0 kB
         Cached:          29536 kB
         SwapCached:       2520 kB
         Active:          32336 kB
         Inactive:         8336 kB
         HighTotal:           0 kB
         HighFree:            0 kB
         LowTotal:      1945680 kB
         LowFree:       1853812 kB
         SwapTotal:     2047992 kB
         SwapFree:      2037268 kB
         Dirty:            1396 kB
         Writeback:           0 kB

>OK, so there's nothing there at all really (or there may be.  Hard
>to tell when the interface has saturated).

>But on my lowly scsi disks I was seeing no change in read bandwidth
>either.  Only writes benefitted for some reason.   Can you do
>some write testing as well?   If you test writes through the pagecache,
>use ext2 and not direct-to-blockdev please - that'll take the multipage
>BIOs, buffer_head-bypass route.  Plain old read and write of /dev/XdYY
>isn't very optimised at all.

will do.

do you have any other preferences --
  - ext2 or ext3?
  - if ext3, change the journalling mode?
  - i/o to a single large file or multiple files per spindle?

i can also add combinations of read/write & seeking also.
what kind of file-size should i be using?

cheers,

lincoln.


 
 
 

direct-to-BIO for O_DIRECT

Post by Adam J. Richter » Fri, 12 Jul 2002 15:20:06




[...]
>> It would be nice if we could just map a set of user pages to a scatterlist.

>After disabling kiobufs in sg I would like such a drop
>in replacement.

>> Developers of mass transfer devices (video grabbers, dsp devices, sg and
>> many others) would just LOVE you for this ;-)

>Agreed. Tape devices could be added to your list.
>Large page support will make for very efficient zero
>copy IO.

>> Block devices are the common case worth optimizing for, but character
>> devices just need to reimplement most of this, if they want the same
>> optimizations. Some devices need mass transfers and are NOT blockdevices.

>> Please consider supporting them better for 2.5 in stuff similiar to BIOs
>> and DMA to/from user pages.

>CIOs?

        This is what I want to accomplish in my proposal to
pull most of the DMA transfer optimization code up from block
devices by generalizing DMA targets and turning struct scatterlist
into a linked list, discussed here:

        http://marc.theaimsgroup.com/?t=102487685000002&r=1&w=2
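
        To make that concrete, the idea is roughly the following shape (a
sketch of the proposal only, not existing kernel code -- today's struct
scatterlist is a flat array with no chaining field):

        /* Hypothetical chained scatter-gather element; the 'next' pointer
         * is the proposed addition, everything else roughly mirrors what a
         * scatterlist entry already carries. */
        struct sg_entry {
                struct page     *page;          /* physical page of the segment */
                unsigned int    offset;         /* byte offset within the page */
                unsigned int    length;         /* bytes in this segment */
                dma_addr_t      dma_address;    /* filled in by the DMA mapping layer */
                struct sg_entry *next;          /* proposed: chain to the next element */
        };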

        I have not started coding this yet because:

        1. I'm tracking down a bug in the next revision of my proposed
           bio_append patch (which eliminates {read,write}_full_page from
           fs/buffer.c), and I want to hit that ball out of my court first.

        2. I want to look at aio to see if it has a better way or if it
           could benefit from this.

        3. I want to accommodate Dave Miller's request for a non-PCI
           generalization of pci_alloc_consistent, pci_map_single, etc.,
           first, and that will depend on struct device, for which there are
           some relevant changes working their way from Patrick Mochel
           to Linus.

        4. After getting a general dma_alloc_consistent, etc. interface,
           then I want to create a struct dma_target, to abstract
           out the DMA capabilities currently maintained by the block
           layer.  I hope that by doing this in stages, it will be
           more palatable to Jens, who expressed concern that
           my proposal to go to a linked list for struct scatterlist
           was a bit too much change.

        Then, I think we'll be in a better position to go to a struct
scatterlist linked list or something similar that can be used by most
if not all big producers of IO.

        In the meantime, killing off kiobufs should be helpful.

Adam J. Richter                              575 Oroville Road

+1 408 309-6081         | g g d r a s i l   United States of America
                         "Free Software For The Rest Of Us."

 
 
 

direct-to-BIO for O_DIRECT

Post by Ingo Oeser » Sat, 13 Jul 2002 02:40:06



> Ben had lightweight sg structures called `kvecs' and `kveclets'. And
> library functions to map pages into them.  And code to attach them
> to BIOs.  So we'll be looking at getting that happening.

Ok, I've looked at them and they don't help me at all.

A user who splits their IO into single pages wants to do DMA and
needs bus addresses for that, so they need "struct scatterlist".

If one doesn't need to do DMA, one can do copy_{from,to}_user
directly with an intermediate buffer, so the splitup isn't needed.

From this I conclude that using the EXISTING 'struct scatterlist'
will be enough for both. Attaching a vector of these to BIOs
is no problem. Neither is it for CHARACTER device IOs (CIOs).

So by using this simple abstraction we MIGHT waste only 4-8 bytes
per page submitted, but by page-splitting the IO only for devices
that need DMA (i.e. those that request it explicitly) we don't
really waste it, and we support BIOs and CIOs the same way.

I will refine that code for my own uses anyway, so if nobody with
more clues about IO than me implements it, I will submit it
later.

Regards

Ingo Oeser
--
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth

 
 
 

direct-to-BIO for O_DIRECT

Post by Jesse Barnes » Sat, 13 Jul 2002 05:00:11



> sorry for the delay.
> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> had to update it to reflect the io_request_lock -> host->host_lock,
> kdev_t and kbuild changes.  urgh, pain pain pain.  in the process, i
> discovered some races in their driver, so fixed them also.

So you ported the qla2x00 driver forward to 2.5?  Would it be possible
to post that driver?  Not having it has held up some testing I'd like
to do...

Thanks,
Jesse