NFS O_DIRECT clean-up #2

NFS O_DIRECT clean-up #2

Post by Chuck Leve » Thu, 14 Nov 2002 01:10:10



get NFS O_DIRECT support close to working.  affects only O_DIRECT paths,
no normal I/O paths are touched.

against 2.5.47.

diff -ruN 04-odirect1/fs/nfs/direct.c 05-odirect2/fs/nfs/direct.c
--- 04-odirect1/fs/nfs/direct.c Tue Nov 12 16:07:53 2002
+++ 05-odirect2/fs/nfs/direct.c Tue Nov 12 16:08:54 2002
@@ -35,10 +35,12 @@
  */

 #include <linux/config.h>
+#include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/file.h>
-#include <linux/errno.h>
+#include <linux/pagemap.h>
+
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/sunrpc/clnt.h>
@@ -48,33 +50,39 @@

 #define NFSDBG_FACILITY                (NFSDBG_PAGECACHE | NFSDBG_VFS)
 #define VERF_SIZE              (2 * sizeof(__u32))
+#define MAX_DIRECTIO_SIZE      (4096UL << PAGE_SHIFT)

 /**
- * nfs_get_user_pages - find and set up page representing user buffer
- * addr: user-space address of target buffer
- * size: total size in bytes of target buffer
+ * nfs_get_user_pages - find and set up pages representing user buffer
+ * @vec: pointer to vector that defines I/O buffer
  * @pages: returned array of page struct pointers underlying target buffer
- * write: whether or not buffer is target of a write operation
+ * rw: whether or not buffer is target of a write operation
  */
 static inline int
-nfs_get_user_pages(unsigned long addr, size_t size,
-               struct page ***pages, int rw)
+nfs_get_user_pages(int rw, const struct iovec *vec, struct page ***pages)
 {
+       unsigned long addr = (unsigned long) vec->iov_base;
+       size_t size = vec->iov_len;
        int result = -ENOMEM;
-       unsigned page_count = (unsigned) size >> PAGE_SHIFT;
-       unsigned array_size = (page_count * sizeof(struct page *)) + 2U;
+       unsigned long page_count;
+       size_t array_size;
+
+       /* set an arbitrary limit to prevent arithmetic overflow */
+       if (size > MAX_DIRECTIO_SIZE)
+               return -EFBIG;
+
+       page_count = (addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       page_count -= addr >> PAGE_SHIFT;

-       *pages = (struct page **) kmalloc(array_size, GFP_KERNEL);
+       array_size = (page_count * sizeof(struct page *));
+       *pages = kmalloc(array_size, GFP_KERNEL);
        if (*pages) {
                down_read(&current->mm->mmap_sem);
                result = get_user_pages(current, current->mm, addr,
-                                       page_count, (rw == WRITE), 0,
+                                       page_count, (rw == READ), 0,
                                        *pages, NULL);
                up_read(&current->mm->mmap_sem);
-               if (result < 0)
-                       printk(KERN_ERR "%s: get_user_pages result %d\n",
-                                       __FUNCTION__, result);
        }
        return result;
 }
@@ -84,141 +92,208 @@
  * @pages: array of page struct pointers underlying target buffer
  */
 static inline void
-nfs_free_user_pages(struct page **pages, unsigned count)
+nfs_free_user_pages(struct page **pages)
+{
+       kfree(pages);
+}
+
+/**
+ * nfs_iov2pagelist - convert one iov to a list of page requests
+ * rw: direction (read or write)
+ * @file: file struct of target file
+ * @iov: pointer to vector that defines I/O buffer
+ * offset: where in file to begin the read
+ * @pages: array of mapped pages to use for this I/O
+ * @requests: append new page requests to this list head
+ */
+static int
+nfs_iov2pagelist(int rw, struct file *file, const struct iovec *iov,
+               loff_t offset, struct page **pages, struct list_head *requests)
 {
+       struct inode *inode = file->f_dentry->d_inode;
+       struct rpc_cred *cred = nfs_file_cred(file);
+       unsigned long user_addr = (unsigned long) iov->iov_base;
+       size_t bytes = iov->iov_len;
+
+       unsigned pg_offset = user_addr & ~PAGE_MASK;
        unsigned page = 0;
+       int tot_bytes = 0;
+
+       while (bytes) {
+               struct nfs_page *new;
+               unsigned pg_bytes;
+
+               pg_bytes = PAGE_SIZE - pg_offset;
+               if (pg_bytes > bytes)
+                       pg_bytes = bytes;
+
+               /* page is unlocked by nfs_readpage_result */
+               if (rw == READ)
+                       lock_page(pages[page]);
+
+               new = nfs_create_request(cred, inode, pages[page],
+                                        pg_offset, pg_bytes);
+               if (IS_ERR(new)) {
+                       nfs_release_list(requests);
+                       nfs_free_user_pages(pages);
+                       return PTR_ERR(new);
+               }

-       while (count--)
-               page_cache_release(pages[page++]);
+               new->wb_index = offset >> PAGE_SHIFT;
+               new->wb_offset = offset & ~ PAGE_MASK;
+               if (rw == WRITE)
+                       /* a la nfs_update_request */
+                       nfs_lock_request_dontget(new);
+               else
+                       /* a la nfs_scan_list */
+                       nfs_lock_request(new);
+               nfs_list_add_request(new, requests);
+
+               offset += pg_bytes;
+               tot_bytes += pg_bytes;
+               bytes -= pg_bytes;
+               page++;

-       kfree(pages);
+               /* after the first page */
+               pg_offset = 0;
+       }
+
+       return tot_bytes;
 }

 /**
- * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_read_async - Read directly using asynchronous RPC
+ * @file: target file
  * @iov: array of vectors that define I/O buffer
  * offset: where in file to begin the read
  * nr_segs: size of iovec array
- * @requests: append new page requests to this list head
+ *
+ * Flush out pending writes in the page cache first so we pick up changes
+ * made by non-direct writers.  Then convert the iovecs into a list of NFS
+ * page requests, and page them in.
  */
 static int
-nfs_iov2pagelist(int rw, const struct inode *inode,
-               const struct rpc_cred *cred,
-               const struct iovec *iov, loff_t offset,
-               unsigned long nr_segs, struct list_head *requests)
+nfs_direct_read_async(struct file *file, const struct iovec *iov,
+               loff_t offset, unsigned long nr_segs)
 {
+       LIST_HEAD(requests);
+       struct page **pages;
+       struct inode *inode = file->f_dentry->d_inode;
        unsigned seg;
+       int result;
        int tot_bytes = 0;
-       struct page **pages;

-       /* for each iovec in the array... */
+       /* push any pending non-direct writes so our reads see them */
+       result = nfs_wb_all(inode);
+       if (result < 0) {
+               result = file->f_error;
+               file->f_error = 0;
+               return result;
+       }
+
        for (seg = 0; seg < nr_segs; seg++) {
-               const unsigned long user_addr =
-                                       (unsigned long) iov[seg].iov_base;
-               size_t bytes = iov[seg].iov_len;
-               unsigned int pg_offset = (user_addr & ~PAGE_MASK);
-               int page_count, page = 0;
+               const struct iovec *vec = &iov[seg];
+               int page_count;

-               page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
+               page_count = nfs_get_user_pages(READ, vec, &pages);
                if (page_count < 0) {
-                       nfs_release_list(requests);
+                       nfs_release_list(&requests);
+                       nfs_free_user_pages(pages);
                        return page_count;
                }

-               /* ...build as many page requests as required */
-               while (bytes > 0) {
-                       struct nfs_page *new;
-                       const unsigned int pg_bytes = (bytes > PAGE_SIZE) ?
-                                                       PAGE_SIZE : bytes;
-
-                       new = nfs_create_request((struct rpc_cred *) cred,
-                                                (struct inode *) inode,
-                                                pages[page],
-                                                pg_offset, pg_bytes);
-                       if (IS_ERR(new)) {
-                               nfs_free_user_pages(pages, page_count);
-                               nfs_release_list(requests);
-                               return PTR_ERR(new);
-                       }
-                       new->wb_index = offset;
-                       nfs_list_add_request(new, requests);
-
-                       /* after the first page */
-                       pg_offset = 0;
-                       offset += PAGE_SIZE;
-                       tot_bytes += pg_bytes;
-                       bytes -= pg_bytes;
-                       page++;
+               result = nfs_iov2pagelist(READ, file, vec, offset, pages,
+                               &requests);
+               if (result <= 0) {
+                       nfs_release_list(&requests);
+                       nfs_free_user_pages(pages);
+                       return result;
+               }
+               tot_bytes += result;
+
+               result = nfs_pagein_list(&requests,
+                                       NFS_SERVER(inode)->rpages);
+               if (result < 0) {
+                       tot_bytes = result;
+                       nfs_free_user_pages(pages);
+                       break;
                }

-               /* don't release pages here -- I/O completion will do that */
-               nfs_free_user_pages(pages, 0);
+               while (page_count--) {
+                       struct page *page = pages[page_count];
+
+                       page_cache_get(page);
+                       wait_on_page_locked(page);
+                       if (PageError(page))
+                               tot_bytes = -EIO;
+                       page_cache_release(page);
+               }
+
+               nfs_free_user_pages(pages);
+
+               if (tot_bytes < 0)
+                       break;
        }

        return tot_bytes;
 }

 /**
- * do_nfs_direct_IO - Read or write data without caching
- * @inode: inode of target file
- * @cred: credentials of user who requested I/O
+ * nfs_direct_write_async - Write directly using asynchronous RPC
+ * @file: file struct of target file
  * @iov: array of vectors that define I/O buffer
  * offset: where in file to begin the read
  * nr_segs: size of iovec array
  *
- * Break the passed-in iovec into a series of page-sized or smaller
- * requests, where each page is mapped for direct user-land I/O.
- *
- * For each of these pages, create an NFS page request and
- * append it to an automatic list of page requests.
+ * Convert the iovecs into a list of NFS page requests, then flush
+ * them out.  Invalidate any cached pages for this file so that normal
+ * accessors will see these changes.
  *
- * When all page requests have been queued, start the I/O on the
- * whole list.  The underlying routines coalesce the pages on the
- * list into a bunch of asynchronous "r/wsize" network requests.
+ * nfs_writeback_done will emit COMMIT requests should they be required.
  *
- * I/O completion automatically unmaps and releases the pages.
+ * O_SYNC for files and "sync" for file systems causes STABLE writes
+ * that are sent one at a time.  This is for applications that have
+ * ultra-strict write ordering requirements.
  */
 static int
-do_nfs_direct_IO(int rw, const struct inode *inode,
-               const struct rpc_cred *cred, const struct iovec *iov,
+nfs_direct_write_async(struct file *file, const struct iovec *iov,
                loff_t offset, unsigned long nr_segs)
 {
        LIST_HEAD(requests);
-       int result, tot_bytes;
+       struct page **pages;
+       struct inode *inode = file->f_dentry->d_inode;
+       unsigned seg;
+       int result;
+       int tot_bytes = 0;
+       int flags = IS_SYNC(inode) ?  FLUSH_STABLE | FLUSH_WAIT : FLUSH_WAIT;

-       result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs,
-                                                               &requests);
-       if (result < 0)
-               return result;
-       tot_bytes = result;
+       for (seg = 0; seg < nr_segs; seg++) {
+               const struct iovec *vec = &iov[seg];

-       switch (rw) {
-       case READ:
-               if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
-                       result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs);
-                       break;
+               result = nfs_get_user_pages(WRITE, vec, &pages);
+               if (result < 0) {
+                       nfs_release_list(&requests);
+                       nfs_free_user_pages(pages);
+                       return result;
                }
-               result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
-               break;
-       case WRITE:
-               if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE))
-                       result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs);
-               else
-                       result = nfs_flush_list(&requests,
-                                       NFS_SERVER(inode)->wpages, FLUSH_WAIT);

-               /* invalidate cache so non-direct readers pick up changes */
-               invalidate_inode_pages((struct inode *) inode);
-               break;
-       default:
-               result = -EINVAL;
-               break;
+               result = nfs_iov2pagelist(WRITE, file, vec, offset,
+                                                       pages, &requests);
+               nfs_free_user_pages(pages);
+               if (result <= 0)
+                       break;
+               tot_bytes += result;
+
+               result = nfs_flush_list(&requests,
+                                       NFS_SERVER(inode)->wpages, flags);
+               if (result < 0) {
+                       tot_bytes = result;
+                       break;
+               }
        }

-       if (result < 0)
-               return result;
+       /* cause any non-direct readers to pick up our writes */
+       invalidate_inode_pages(inode->i_mapping);
        return tot_bytes;
 }

@@ -232,27 +307,39 @@
  *
  * The inode's i_sem is no longer held by the VFS layer before it calls
  * this function to do a write.
+ *
+ * For now, NFS O_DIRECT returns -EINVAL when r/wsize is smaller than
+ * PAGE_SIZE.
  */
 int
 nfs_direct_IO(int rw, struct file *file, const struct iovec *iov,
                loff_t offset, unsigned long nr_segs)
 {
-       /* None of this works yet, so prevent it from compiling. */
-#if 0
-       int result;
-       struct dentry *dentry = file->f_dentry;
-       const struct inode *inode = dentry->d_inode->i_mapping->host;
-       const struct rpc_cred *cred = nfs_file_cred(file);
-#endif
-
-       dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
-                               ((rw == READ) ? "READ" : "WRITE"),
-                               dentry->d_parent->d_name.name,
-                               dentry->d_name.name, offset, nr_segs);
+       int result = -EINVAL;
+       struct inode *inode = file->f_dentry->d_inode;

-       result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);
+       switch (rw) {
+       case READ:
+               dfprintk(VFS, "NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
+                       file->f_dentry->d_name.name, offset, nr_segs);
+
+               if (NFS_SERVER(inode)->rpages)
+                       result = nfs_direct_read_async(file, iov, offset,
+                                       nr_segs);
+               break;
+       case WRITE:
+               dfprintk(VFS, "NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
+                       file->f_dentry->d_name.name, offset, nr_segs);
+
+               if (NFS_SERVER(inode)->wpages)
+                       result = nfs_direct_write_async(file, iov, offset,
+                                       nr_segs);
+               break;
+       default:
+               break;
+       }

-       dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+       dfprintk(VFS, "NFS: direct_IO result=%d\n", result);

        return result;
 }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

1. O_DIRECT! or O_DIRECT?

Several of us are working on it.

                -ben

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

2. Why doesn't my dmfe1 work ?

3. Shape Ups,Men's Shape Ups,Men's Skechers Shape Ups - new styles!

4. non-ascii keyboard (spanish)

5. NFS O_DIRECT update

6. Can i install another WM without stuffing my WindowMaker installation up...

7. Experimental O_DIRECT support for NFS

8. RH72, XP and Win98 on one drive

9. NFS O_DIRECT support for 2.5

10. IPv6: Miscellaneous clean-ups

11. IDE iops clean ups

12. x86 timer clean ups

13. Clean stop of NFS server