direct-to-BIO writeback

Post by Andrew Morton » Tue, 28 May 2002 05:50:10



Multipage BIO writeout from the pagecache.

It's pretty much the same as multipage reads.  It falls back to buffers
if things get complicated.

The write case is a little more complex because it must handle both
pages which have buffers and pages which do not.  If the page doesn't
have buffers, this code does not add them.
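
The core test is simple: a page can go into a BIO only if all its
blocks are contiguous on disk; otherwise we fall back.  Here's a toy
user-space illustration of that check (not the kernel code; see
mpage_writepage() in the patch for the real thing, which also checks
mapped/dirty/uptodate state):

	#include <stdio.h>

	/*
	 * A page is BIO-writable only when its blocks are consecutive
	 * on disk; any hole or discontiguity means falling back to
	 * block_write_full_page().
	 */
	static int blocks_are_contiguous(const unsigned long long *blocks,
						unsigned n)
	{
		unsigned i;

		for (i = 1; i < n; i++)
			if (blocks[i] != blocks[i - 1] + 1)
				return 0;	/* discontiguous: fall back */
		return 1;			/* contiguous: use a BIO */
	}

	int main(void)
	{
		unsigned long long contig[] = { 100, 101, 102, 103 };
		unsigned long long split[]  = { 100, 101, 200, 201 };

		printf("contig: %d\n", blocks_are_contiguous(contig, 4)); /* 1 */
		printf("split:  %d\n", blocks_are_contiguous(split, 4));  /* 0 */
		return 0;
	}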

=====================================

--- 2.5.18/fs/mpage.c~mpage-write       Sun May 26 02:51:21 2002
+++ 2.5.18-akpm/fs/mpage.c      Sun May 26 02:51:31 2002
@@ -60,11 +60,31 @@ static void mpage_end_io_read(struct bio
        bio_put(bio);
 }

+static void mpage_end_io_write(struct bio *bio)
+{
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+       do {
+               struct page *page = bvec->bv_page;
+
+               if (--bvec >= bio->bi_io_vec)
+                       prefetchw(&bvec->bv_page->flags);
+
+               if (!uptodate)
+                       SetPageError(page);
+               end_page_writeback(page);
+       } while (bvec >= bio->bi_io_vec);
+       bio_put(bio);
+}
+
 struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
        bio->bi_vcnt = bio->bi_idx;
        bio->bi_idx = 0;
        bio->bi_end_io = mpage_end_io_read;
+       if (rw == WRITE)
+               bio->bi_end_io = mpage_end_io_write;
        submit_bio(rw, bio);
        return NULL;
 }
@@ -270,3 +290,258 @@ int mpage_readpage(struct page *page, ge
        return 0;
 }
 EXPORT_SYMBOL(mpage_readpage);
+
+/*
+ * Writing is not so simple.
+ *
+ * If the page has buffers then they will be used for obtaining the disk
+ * mapping.  We only support pages which are fully mapped-and-dirty, with a
+ * special case for pages which are unmapped at the end: end-of-file.
+ *
+ * If the page has no buffers (preferred) then the page is mapped here.
+ *
+ * If all blocks are found to be contiguous then the page can go into the
+ * BIO.  Otherwise fall back to block_write_full_page().
+ *
+ * FIXME: This code wants an estimate of how many pages are still to be
+ * written, so it can intelligently allocate a suitably-sized BIO.  For now,
+ * just allocate full-size (16-page) BIOs.
+ */
+static /* inline */ struct bio *
+mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
+                       sector_t *last_block_in_bio, int *ret)
+{
+       struct inode *inode = page->mapping->host;
+       const unsigned blkbits = inode->i_blkbits;
+       unsigned long end_index;
+       const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+       struct bio_vec *bvec;
+       sector_t last_block;
+       sector_t block_in_file;
+       sector_t blocks[MAX_BUF_PER_PAGE];
+       unsigned page_block;
+       unsigned first_unmapped = blocks_per_page;
+       struct block_device *bdev = NULL;
+       int boundary = 0;
+
+       if (page_has_buffers(page)) {
+               struct buffer_head *head = page_buffers(page);
+               struct buffer_head *bh = head;
+
+               /* If they're all mapped and dirty, do it */
+               page_block = 0;
+               do {
+                       BUG_ON(buffer_locked(bh));
+                       if (!buffer_mapped(bh)) {
+                               /*
+                                * unmapped dirty buffers are created by
+                                * __set_page_dirty_buffers -> mmapped data
+                                */
+                               if (buffer_dirty(bh))
+                                       goto confused;
+                               if (first_unmapped == blocks_per_page)
+                                       first_unmapped = page_block;
+                               continue;
+                       }
+
+                       if (first_unmapped != blocks_per_page)
+                               goto confused;  /* hole -> non-hole */
+
+                       if (!buffer_dirty(bh) || !buffer_uptodate(bh))
+                               goto confused;
+                       if (page_block) {
+                               if (bh->b_blocknr != blocks[page_block-1] + 1)
+                                       goto confused;
+                       }
+                       blocks[page_block++] = bh->b_blocknr;
+                       boundary = buffer_boundary(bh);
+                       bdev = bh->b_bdev;
+               } while ((bh = bh->b_this_page) != head);
+
+               if (first_unmapped)
+                       goto page_is_mapped;
+
+               /*
+                * Page has buffers, but they are all unmapped. The page was
+                * created by pagein or read over a hole which was handled by
+                * block_read_full_page().  If this address_space is also
+                * using mpage_readpages then this can rarely happen.
+                */
+               goto confused;
+       }
+
+       /*
+        * The page has no buffers: map it to disk
+        */
+       BUG_ON(!PageUptodate(page));
+       block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+       last_block = (inode->i_size - 1) >> blkbits;
+       for (page_block = 0; page_block < blocks_per_page; ) {
+               struct buffer_head map_bh;
+
+               map_bh.b_state = 0;
+               if (get_block(inode, block_in_file, &map_bh, 1))
+                       goto confused;
+               if (buffer_new(&map_bh))
+                       unmap_underlying_metadata(map_bh.b_bdev,
+                                               map_bh.b_blocknr);
+               if (page_block) {
+                       if (map_bh.b_blocknr != blocks[page_block-1] + 1)
+                               goto confused;
+               }
+               blocks[page_block++] = map_bh.b_blocknr;
+               boundary = buffer_boundary(&map_bh);
+               bdev = map_bh.b_bdev;
+               if (block_in_file == last_block)
+                       break;
+               block_in_file++;
+       }
+       if (page_block == 0)
+               buffer_error();
+
+       first_unmapped = page_block;
+
+       end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+       if (page->index >= end_index) {
+               unsigned offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+
+               if (page->index > end_index || !offset)
+                       goto confused;
+               memset(kmap(page) + offset, 0, PAGE_CACHE_SIZE - offset);
+               flush_dcache_page(page);
+               kunmap(page);
+       }
+
+page_is_mapped:
+
+       /*
+        * This page will go to BIO.  Do we need to send this BIO off first?
+        */
+       if (bio && (bio->bi_idx == bio->bi_vcnt ||
+                               *last_block_in_bio != blocks[0] - 1))
+               bio = mpage_bio_submit(WRITE, bio);
+
+       if (bio == NULL) {
+               unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
+
+               bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+                                       nr_bvecs, GFP_NOFS);
+               if (bio == NULL)
+                       goto confused;
+       }
+
+       /*
+        * OK, we have our BIO, so we can now mark the buffers clean.  Make
+        * sure to only clean buffers which we know we'll be writing.
+        */
+       if (page_has_buffers(page)) {
+               struct buffer_head *head = page_buffers(page);
+               struct buffer_head *bh = head;
+               unsigned buffer_counter = 0;
+
+               do {
+                       if (buffer_counter++ == first_unmapped)
+                               break;
+                       clear_buffer_dirty(bh);
+                       bh = bh->b_this_page;
+               } while (bh != head);
+       }
+
+       bvec = &bio->bi_io_vec[bio->bi_idx++];
+       bvec->bv_page = page;
+       bvec->bv_len = (first_unmapped << blkbits);
+       bvec->bv_offset = 0;
+       bio->bi_size += bvec->bv_len;
+       BUG_ON(PageWriteback(page));
+       SetPageWriteback(page);
+       unlock_page(page);
+       if (boundary || (first_unmapped != blocks_per_page))
+               bio = mpage_bio_submit(WRITE, bio);
+       else
+               *last_block_in_bio = blocks[blocks_per_page - 1];
+       goto out;
+
+confused:
+       if (bio)
+               bio = mpage_bio_submit(WRITE, bio);
+       *ret = block_write_full_page(page, get_block);
+out:
+       return bio;
+}
+
+/*
+ * This is a cut-n-paste of generic_writeback_mapping().  We _could_
+ * generalise that function.  It'd get a bit messy.  We'll see.
+ */
+int
+mpage_writeback_mapping(struct address_space *mapping,
+                       int *nr_to_write, get_block_t get_block)
+{
+       struct bio *bio = NULL;
+       sector_t last_block_in_bio = 0;
+       int ret = 0;
+       int done = 0;
+
+       write_lock(&mapping->page_lock);
+
+       list_splice(&mapping->dirty_pages, &mapping->io_pages);
+       INIT_LIST_HEAD(&mapping->dirty_pages);
+
+        while (!list_empty(&mapping->io_pages) && !done) {
+               struct page *page = list_entry(mapping->io_pages.prev,
+                                       struct page, list);
+               list_del(&page->list);
+               if (PageWriteback(page)) {
+                       if (PageDirty(page)) {
+                               list_add(&page->list, &mapping->dirty_pages);
+                               continue;
+                       }
+                       list_add(&page->list, &mapping->locked_pages);
+                       continue;
+               }
+               if (!PageDirty(page)) {
+                       list_add(&page->list, &mapping->clean_pages);
+                       continue;
+               }
+               list_add(&page->list, &mapping->locked_pages);
+
+               page_cache_get(page);
+               write_unlock(&mapping->page_lock);
+
+               lock_page(page);
+
+               if (page->mapping && TestClearPageDirty(page) &&
+                                       !PageWriteback(page)) {
+                       /* FIXME: batch this up */
+                       if (!PageActive(page) && PageLRU(page)) {
+                               spin_lock(&pagemap_lru_lock);
+                               if (!PageActive(page) && PageLRU(page)) {
+                                       list_del(&page->lru);
+                                       list_add(&page->lru, &inactive_list);
+                               }
+                               spin_unlock(&pagemap_lru_lock);
+                       }
+                       bio = mpage_writepage(bio, page, get_block,
+                                       &last_block_in_bio, &ret);
+                       if (ret || (nr_to_write && --(*nr_to_write) <= 0))
+                               done = 1;
+               } else {
+                       unlock_page(page);
+               }
+
+               page_cache_release(page);
+               write_lock(&mapping->page_lock);
+       }
+       if (!list_empty(&mapping->io_pages)) {
+               /*
+                * Put the rest back, in the correct order.
+                */
+               list_splice(&mapping->io_pages, mapping->dirty_pages.prev);
+               INIT_LIST_HEAD(&mapping->io_pages);
+       }
+       write_unlock(&mapping->page_lock);
+       if (bio)
+               mpage_bio_submit(WRITE, bio);
+       return ret;
+}
+EXPORT_SYMBOL(mpage_writeback_mapping);
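
A note on the arithmetic in the mpage_alloc() call above: BIOs address
512-byte sectors, so the page's first filesystem block is converted to
a sector with blocks[0] << (blkbits - 9), and nr_bvecs comes out to 16
pages per the FIXME comment.  A standalone sketch of the conversion
(user-space; 4096-byte blocks assumed for the example):

	#include <stdio.h>

	int main(void)
	{
		const unsigned blkbits = 12;		/* 4096-byte blocks */
		unsigned long long block = 1000;	/* filesystem block */
		unsigned long long sector = block << (blkbits - 9);

		printf("block %llu (%u-byte blocks) -> sector %llu\n",
		       block, 1u << blkbits, sector);	/* sector 8000 */
		return 0;
	}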
--- 2.5.18/include/linux/mpage.h~mpage-write    Sun May 26 02:51:21 2002
+++ 2.5.18-akpm/include/linux/mpage.h   Sun May 26 02:51:25 2002
@@ -13,3 +13,6 @@
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
                                unsigned nr_pages, get_block_t get_block);
 int mpage_readpage(struct page *page, get_block_t get_block);
+int mpage_writeback_mapping(struct address_space *mapping,
+               int *nr_to_write, get_block_t get_block);
+
--- 2.5.18/fs/ext2/inode.c~mpage-write  Sun May 26 02:51:21 2002
+++ 2.5.18-akpm/fs/ext2/inode.c Sun May 26 02:51:25 2002
@@ -622,7 +622,7 @@ ext2_writeback_mapping(struct address_sp
        int err;

        ret = write_mapping_buffers(mapping);
-       err = generic_writeback_mapping(mapping, nr_to_write);
+       err = mpage_writeback_mapping(mapping, nr_to_write, ext2_get_block);
        if (!ret)
                ret = err;
        return ret;
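
The ext2 hunk above is the whole conversion for a filesystem: point its
writeback_mapping operation at mpage_writeback_mapping() and pass the
filesystem's own get_block.  Hypothetically, for some other filesystem
(the myfs_* names are made up for illustration):

	static int
	myfs_writeback_mapping(struct address_space *mapping, int *nr_to_write)
	{
		/* hand the dirty pages to the multipage BIO path; it
		 * falls back to block_write_full_page() when needed */
		return mpage_writeback_mapping(mapping, nr_to_write,
						myfs_get_block);
	}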
--- 2.5.18/include/linux/buffer_head.h~mpage-write      Sun May 26 02:51:21 2002
+++ 2.5.18-akpm/include/linux/buffer_head.h     Sun May 26 02:51:25 2002
@@ -162,6 +162,7 @@ int inode_has_buffers(struct inode *);
 void invalidate_inode_buffers(struct inode *);
 int fsync_buffers_list(spinlock_t *lock, struct list_head *);
 int sync_mapping_buffers(struct address_space *mapping);
+void unmap_underlying_metadata(struct block_device *bdev, sector_t block);

 void mark_buffer_async_read(struct buffer_head *bh);
 void mark_buffer_async_write(struct buffer_head *bh);
--- 2.5.18/fs/buffer.c~mpage-write      Sun May 26 02:51:21 2002
+++ 2.5.18-akpm/fs/buffer.c     Sun May 26 02:51:25 2002
@@ -1448,11 +1448,11 @@ EXPORT_SYMBOL(create_empty_buffers);
  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
  * only if we really need to.  That happens here.
  */
-static void unmap_underlying_metadata(struct buffer_head *bh)
+void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
 {
        struct buffer_head *old_bh;

-       old_bh = __get_hash_table(bh->b_bdev, bh->b_blocknr, 0);
+       old_bh = __get_hash_table(bdev, block, 0);
        if (old_bh) {
 #if 0  /* This happens.  Later. */
                if (buffer_dirty(old_bh))
@@ -1548,7 +1548,8 @@ static int __block_write_full_page(struc
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
-                               unmap_underlying_metadata(bh);
+                               unmap_underlying_metadata(bh->b_bdev,
+                                                       bh->b_blocknr);
                        }
                }
                bh = bh->b_this_page;
@@ -1689,7 +1690,8 @@ static int __block_prepare_write(struct
                                goto out;
                        if (buffer_new(bh)) {
                                clear_buffer_new(bh);
-                               unmap_underlying_metadata(bh);
+                               unmap_underlying_metadata(bh->b_bdev,
+                                                       bh->b_blocknr);
                                if (PageUptodate(page)) {
                                        if (!buffer_mapped(bh))
                                                buffer_error();
@@ -2191,7 +2193,8 @@ int generic_direct_IO(int rw, struct ino
                        }
                } else {
                        if (buffer_new(&bh))
-                               unmap_underlying_metadata(&bh);
+                               unmap_underlying_metadata(bh.b_bdev,
+                                                       bh.b_blocknr);
                        if (!buffer_mapped(&bh))
                                BUG();
                }
