On Mon, May 13, 2002 at 01:48:55AM +0200, Trond Myklebust wrote:
> >>>>> " " == Mario Vanoni <vano...@dial.eunet.ch> writes:
> > Hi Trond, hi Andrea, hi All In production environment, since >6
> > months, ethernet 10Mbits/s, on backup_machine mount -t nfs
> > production_machine /mnt.
> > find `listing from production_machine` | \ cpio -pdm
> > backup_machine
> > Volume ~320MB, nearly constant.
> > Medium times:
> > 2.4.17-rc1aa1: 1m58s, _the_ champion !!!
> > all later's, e.g.:
> > 2.4.19-pre8aa2; 4m35s 2.4.19-pre8-ac1: 4m00s
> > 2.4.19-pre7-rmap13a: 4m02s 2.4.19-pre7: 4m35s 2.4.19-pre4:
> > 4m20s
> > the last usable was:
> > 2.4.19-pre3: 2m35s, _not_ a champion
> > All benchmarks don't reflect some production needs, <2 minutes
> > or >4 minutes is a great difference !!!
> > Mario, not in lkml, but active reader (and tester).
> Mario,
> Your case where you transfer 320MB in 1'58" is either a measurement
> error, or it involves some pretty heavy caching, since otherwise you
> would be reading at ~3MB/sec == ~24Mbit/s over a 10Mbit line.
> 4 minutes is in fact wire speed for 320MB of data over a 10Mbit
> connection. To imply that is 'unusable' would be a tad exaggerating...
> It may indeed be that the CTO patch is having an effect on the cache
> but it should only do so if the file's mtimes or inode number or NFS
> filehandle are changing with time.
> If not, then the only thing that could be causing cache invalidation
> is memory pressure and the standard Linux memory reclamation scheme.
the thing that makes the difference is the backout-cto patch according
those numbers and I doubt it is influencing the page replacement in any
way (is kernel-side memory pressure going to increase significantly with
cto?).
2.4.19-pre3 vanilla
first time after boot 5m15s, then 1m56s 1m57s 1m56s 1m56s 1m57s
2.4.19-pre4 vanilla
5m20s, then 4m00s 4m01s 4m00s 4m01s 4m01a
2.4.19-pre4-nfs-backout-cto, from pr8aa2, 2 Hunks
5m13a, then 1m57s 1m58s 1m57s 1m58s 1m59s
nfs-backout-cto is appended. Now if the previous kernel was buggy and it
was not invalidating "invalid" cache then cto is right, otherwise it
sounds like the cto patch is invalidating more cache than necessary.
diff -urN 2.4.19pre6aa1/fs/nfs/dir.c nfs/fs/nfs/dir.c
--- 2.4.19pre6aa1/fs/nfs/dir.c Fri Apr 5 20:19:43 2002
+++ nfs/fs/nfs/dir.c Fri Apr 5 20:22:01 2002
@@ -423,21 +423,6 @@
}
/*
- * A check for whether or not the parent directory has changed.
- * In the case it has, we assume that the dentries are untrustworthy
- * and may need to be looked up again.
- */
-static inline
-int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
-{
- if (IS_ROOT(dentry))
- return 1;
- if (nfs_revalidate_inode(NFS_SERVER(dir), dir))
- return 0;
- return time_after(dentry->d_time, NFS_MTIME_UPDATE(dir));
-}
-
-/*
* Whenever an NFS operation succeeds, we know that the dentry
* is valid, so we update the revalidation timestamp.
*/
@@ -446,31 +431,48 @@
dentry->d_time = jiffies;
}
-static inline
-int nfs_lookup_verify_inode(struct inode *inode, int flags)
+static inline int nfs_dentry_force_reval(struct dentry *dentry, int flags)
{
- struct nfs_server *server = NFS_SERVER(inode);
+ struct inode *inode = dentry->d_inode;
+ unsigned long timeout = NFS_ATTRTIMEO(inode);
+
/*
- * If we're interested in close-to-open cache consistency,
- * then we revalidate the inode upon lookup.
+ * If it's the last lookup in a series, we use a stricter
+ * cache consistency check by looking at the parent mtime.
+ *
+ * If it's been modified in the last hour, be really strict.
+ * (This still means that we can avoid doing unnecessary
+ * work on directories like /usr/share/bin etc which basically
+ * never change).
*/
- if (!(server->flags & NFS_MOUNT_NOCTO) && !(flags & LOOKUP_CONTINUE))
- NFS_CACHEINV(inode);
- return nfs_revalidate_inode(server, inode);
+ if (!(flags & LOOKUP_CONTINUE)) {
+ long diff = CURRENT_TIME - dentry->d_parent->d_inode->i_mtime;
+
+ if (diff < 15*60)
+ timeout = 0;
+ }
+
+ return time_after(jiffies,dentry->d_time + timeout);
}
/*
* We judge how long we want to trust negative
* dentries by looking at the parent inode mtime.
*
- * If parent mtime has changed, we revalidate, else we wait for a
- * period corresponding to the parent's attribute cache timeout value.
+ * If mtime is close to present time, we revalidate
+ * more often.
*/
-static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry)
+#define NFS_REVALIDATE_NEGATIVE (1 * HZ)
+static inline int nfs_neg_need_reval(struct dentry *dentry)
{
- if (!nfs_check_verifier(dir, dentry))
- return 1;
- return time_after(jiffies, dentry->d_time + NFS_ATTRTIMEO(dir));
+ struct inode *dir = dentry->d_parent->d_inode;
+ unsigned long timeout = NFS_ATTRTIMEO(dir);
+ long diff = CURRENT_TIME - dir->i_mtime;
+
+ if (diff < 5*60 && timeout > NFS_REVALIDATE_NEGATIVE)
+ timeout = NFS_REVALIDATE_NEGATIVE;
+
+ return time_after(jiffies, dentry->d_time + timeout);
}
/*
@@ -481,8 +483,9 @@
* NOTE! The hit can be a negative hit too, don't assume
* we have an inode!
*
- * If the parent directory is seen to have changed, we throw out the
- * cached dentry and do a new lookup.
+ * If the dentry is older than the revalidation interval,
+ * we do a new lookup and verify that the dentry is still
+ * correct.
*/
static int nfs_lookup_revalidate(struct dentry * dentry, int flags)
{
@@ -495,9 +498,13 @@
lock_kernel();
dir = dentry->d_parent->d_inode;
inode = dentry->d_inode;
-
+ /*
+ * If we don't have an inode, let's look at the parent
+ * directory mtime to get a hint about how often we
+ * should validate things..
+ */
if (!inode) {
- if (nfs_neg_need_reval(dir, dentry))
+ if (nfs_neg_need_reval(dentry))
goto out_bad;
goto out_valid;
}
@@ -508,39 +515,48 @@
goto out_bad;
}
- /* Force a full look up iff the parent directory has changed */
- if (nfs_check_verifier(dir, dentry)) {
- if (nfs_lookup_verify_inode(inode, flags))
- goto out_bad;
+ if (!nfs_dentry_force_reval(dentry, flags))
goto out_valid;
- }
- if (NFS_STALE(inode))
- goto out_bad;
+ if (IS_ROOT(dentry)) {
+ __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ goto out_valid_renew;
+ }
+ /*
+ * Do a new lookup and check the dentry attributes.
+ */
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
if (error)
goto out_bad;
- if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0)
+
+ /* Inode number matches? */
+ if (!(fattr.valid & NFS_ATTR_FATTR) ||
+ NFS_FSID(inode) != fattr.fsid ||
+ NFS_FILEID(inode) != fattr.fileid)
goto out_bad;
- if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
+
+ /* Ok, remember that we successfully checked it.. */
+ nfs_refresh_inode(inode, &fattr);
+
+ if (nfs_inode_is_stale(inode, &fhandle, &fattr))
goto out_bad;
+ out_valid_renew:
nfs_renew_times(dentry);
- out_valid:
+out_valid:
unlock_kernel();
return 1;
- out_bad:
- NFS_CACHEINV(dir);
- if (inode && S_ISDIR(inode->i_mode)) {
- /* Purge readdir caches. */
- nfs_zap_caches(inode);
- /* If we have submounts, don't unhash ! */
- if (have_submounts(dentry))
- goto out_valid;
- shrink_dcache_parent(dentry);
- }
+out_bad:
+ shrink_dcache_parent(dentry);
+ /* If we have submounts, don't unhash ! */
+ if (have_submounts(dentry))
+ goto out_valid;
d_drop(dentry);
+ /* Purge readdir caches. */
+ nfs_zap_caches(dir);
+ if (inode && S_ISDIR(inode->i_mode))
+ nfs_zap_caches(inode);
unlock_kernel();
return 0;
}
@@ -573,8 +589,6 @@
nfs_complete_unlink(dentry);
unlock_kernel();
}
- if (is_bad_inode(inode))
- force_delete(inode);
iput(inode);
}
@@ -611,9 +625,9 @@
if (inode) {
no_entry:
d_add(dentry, inode);
+ nfs_renew_times(dentry);
error = 0;
}
- nfs_renew_times(dentry);
}
out:
return ERR_PTR(error);
diff -urN 2.4.19pre6aa1/fs/nfs/inode.c nfs/fs/nfs/inode.c
--- 2.4.19pre6aa1/fs/nfs/inode.c Fri Apr 5 20:19:43 2002
+++ nfs/fs/nfs/inode.c Fri Apr 5 20:22:01 2002
@@ -660,6 +660,7 @@
*/
if (inode->i_mode == 0) {
NFS_FILEID(inode) = fattr->fileid;
+ NFS_FSID(inode) = fattr->fsid;
inode->i_mode = fattr->mode;
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
@@ -698,18 +699,39 @@
struct nfs_fh *fh = desc->fh;
struct nfs_fattr *fattr = desc->fattr;
+ if (NFS_FSID(inode) != fattr->fsid)
+ return 0;
if (NFS_FILEID(inode) != fattr->fileid)
return 0;
if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0)
return 0;
- if (is_bad_inode(inode))
- return 0;
/* Force an attribute cache update if inode->i_count == 0 */
if (!atomic_read(&inode->i_count))
NFS_CACHEINV(inode);
return 1;
}
+int
+nfs_inode_is_stale(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+ /* Empty inodes are not stale */
+ if (!inode->i_mode)
+ return 0;
+
+ if ((fattr->mode & S_IFMT) != (inode->i_mode & S_IFMT))
+ return 1;
+
+ if (is_bad_inode(inode) || NFS_STALE(inode))
+ return 1;
+
+ /* Has the filehandle changed? If so is the old one stale? */
+ if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0 &&
+ __nfs_revalidate_inode(NFS_SERVER(inode),inode) == -ESTALE)
+ return 1;
+
+ return 0;
+}
+
/*
* This is our own version of iget that looks up inodes by file handle
* instead of inode number. We use this technique instead of using
@@ -775,7 +797,7 @@
/*
* Make sure the inode is up-to-date.
*/
- error = nfs_revalidate_inode(NFS_SERVER(inode),inode);
+ error = nfs_revalidate(dentry);
if (error) {
#ifdef NFS_PARANOIA
printk("nfs_notify_change: revalidate failed, error=%d\n", error);
@@ -1021,11 +1043,12 @@
inode->i_dev, inode->i_ino,
atomic_read(&inode->i_count), fattr->valid);
- if
...
read more »