ext2 directory handling

ext2 directory handling

Post by Andrew Morto » Thu, 11 Apr 2002 20:00:14



ext2's directory parsing relies on the preservation of the
data outside i_size in the last page of the directory mapping.

But with the stuff I'm doing, ext2 directory pages are marked
dirty, and can be written out at any time via writepage.

And block_write_full_page zaps the data outside i_size, which
causes the ext2 directory scan code to go into an infinite
loop when it hits rec_len == 0.

The patch teaches ext2 to not look at anything outside
i_size when the backing page is not locked.

It also adds lots of checks for rec_len == 0, which fixes the
lockup.  In a perfect world, those checks are not needed, because
the checking was already performed in ext2_check_page.   But
these checks do help to catch problems which have a tendency to
arise with the stuff I'm doing.

I'd really appreciate it if someone could double-check this
patch, please.

Patch against
2.5.8-pre3+ratcache+readahead+pageprivate+pdflush+pdflush_inodes+pdflush_buffers

--- 2.5.8-pre3/fs/ext2/dir.c~dallocbase-55-ext2_dir     Wed Apr 10 00:42:47 2002
+++ 2.5.8-pre3-akpm/fs/ext2/dir.c       Wed Apr 10 02:44:30 2002
@@ -46,6 +46,21 @@ static inline unsigned long dir_pages(st
        return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
 }

+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ext2_last_byte(struct inode *inode, unsigned long page_nr)
+{
+       unsigned last_byte = inode->i_size;
+
+       last_byte -= page_nr << PAGE_CACHE_SHIFT;
+       if (last_byte > PAGE_CACHE_SIZE)
+               last_byte = PAGE_CACHE_SIZE;
+       return last_byte;
+}
+
 static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to)
 {
        struct inode *dir = page->mapping->host;
@@ -78,10 +93,6 @@ static void ext2_check_page(struct page
                limit = dir->i_size & ~PAGE_CACHE_MASK;
                if (limit & (chunk_size - 1))
                        goto Ebadsize;
-               for (offs = limit; offs<PAGE_CACHE_SIZE; offs += chunk_size) {
-                       ext2_dirent *p = (ext2_dirent*)(kaddr + offs);
-                       p->rec_len = cpu_to_le16(chunk_size);
-               }
                if (!limit)
                        goto out;
        }
@@ -197,8 +208,11 @@ ext2_validate_entry(char *base, unsigned
 {
        ext2_dirent *de = (ext2_dirent*)(base + offset);
        ext2_dirent *p = (ext2_dirent*)(base + (offset&mask));
-       while ((char*)p < (char*)de)
+       while ((char*)p < (char*)de) {
+               if (p->rec_len == 0)
+                       break;
                p = ext2_next_entry(p);
+       }
        return (char *)p - base;
 }

@@ -245,6 +259,7 @@ ext2_readdir (struct file * filp, void *
        unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
        unsigned char *types = NULL;
        int need_revalidate = (filp->f_version != inode->i_version);
+       int ret = 0;

        if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
                goto done;
@@ -265,8 +280,15 @@ ext2_readdir (struct file * filp, void *
                        need_revalidate = 0;
                }
                de = (ext2_dirent *)(kaddr+offset);
-               limit = kaddr + PAGE_CACHE_SIZE - EXT2_DIR_REC_LEN(1);
-               for ( ;(char*)de <= limit; de = ext2_next_entry(de))
+               limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1);
+               for ( ;(char*)de <= limit; de = ext2_next_entry(de)) {
+                       if (de->rec_len == 0) {
+                               ext2_error(sb, __FUNCTION__,
+                                       "zero-length directory entry");
+                               ret = -EIO;
+                               ext2_put_page(page);
+                               goto done;
+                       }
                        if (de->inode) {
                                int over;
                                unsigned char d_type = DT_UNKNOWN;
@@ -283,6 +305,7 @@ ext2_readdir (struct file * filp, void *
                                        goto done;
                                }
                        }
+               }
                ext2_put_page(page);
        }

@@ -326,10 +349,16 @@ struct ext2_dir_entry_2 * ext2_find_entr
                if (!IS_ERR(page)) {
                        kaddr = page_address(page);
                        de = (ext2_dirent *) kaddr;
-                       kaddr += PAGE_CACHE_SIZE - reclen;
+                       kaddr += ext2_last_byte(dir, n) - reclen;
                        while ((char *) de <= kaddr) {
                                if (ext2_match (namelen, name, de))
                                        goto found;
+                               if (de->rec_len == 0) {
+                                       ext2_error(dir->i_sb, __FUNCTION__,
+                                               "zero-length directory entry");
+                                       ext2_put_page(page);
+                                       goto out;
+                               }
                                de = ext2_next_entry(de);
                        }
                        ext2_put_page(page);
@@ -337,6 +366,7 @@ struct ext2_dir_entry_2 * ext2_find_entr
                if (++n >= npages)
                        n = 0;
        } while (n != start);
+out:
        return NULL;

 found:
@@ -401,6 +431,7 @@ int ext2_add_link (struct dentry *dentry
        struct inode *dir = dentry->d_parent->d_inode;
        const char *name = dentry->d_name.name;
        int namelen = dentry->d_name.len;
+       unsigned chunk_size = ext2_chunk_size(dir);
        unsigned reclen = EXT2_DIR_REC_LEN(namelen);
        unsigned short rec_len, name_len;
        struct page *page = NULL;
@@ -411,27 +442,50 @@ int ext2_add_link (struct dentry *dentry
        unsigned from, to;
        int err;

-       /* We take care of directory expansion in the same loop */
+       /*
+        * We take care of directory expansion in the same loop.
+        * This code plays outside i_size, so it locks the page
+        * to protect that region.
+        */
        for (n = 0; n <= npages; n++) {
+               char *dir_end;
+
                page = ext2_get_page(dir, n);
                err = PTR_ERR(page);
                if (IS_ERR(page))
                        goto out;
+               lock_page(page);
                kaddr = page_address(page);
+               dir_end = kaddr + ext2_last_byte(dir, n);
                de = (ext2_dirent *)kaddr;
                kaddr += PAGE_CACHE_SIZE - reclen;
                while ((char *)de <= kaddr) {
                        err = -EEXIST;
                        if (ext2_match (namelen, name, de))
-                               goto out_page;
+                               goto out_unlock;
+                       if ((char *)de == dir_end) {
+                               /* We hit i_size */
+                               name_len = 0;
+                               rec_len = chunk_size;
+                               de->rec_len = cpu_to_le16(chunk_size);
+                               de->inode = 0;
+                               goto got_it;
+                       }
                        name_len = EXT2_DIR_REC_LEN(de->name_len);
                        rec_len = le16_to_cpu(de->rec_len);
                        if (!de->inode && rec_len >= reclen)
                                goto got_it;
                        if (rec_len >= name_len + reclen)
                                goto got_it;
+                       if (de->rec_len == 0) {
+                               ext2_error(dir->i_sb, __FUNCTION__,
+                                       "zero-length directory entry");
+                               err = -EIO;
+                               goto out_unlock;
+                       }
                        de = (ext2_dirent *) ((char *) de + rec_len);
                }
+               unlock_page(page);
                ext2_put_page(page);
        }
        BUG();
@@ -440,7 +494,6 @@ int ext2_add_link (struct dentry *dentry
 got_it:
        from = (char*)de - (char*)page_address(page);
        to = from + rec_len;
-       lock_page(page);
        err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
        if (err)
                goto out_unlock;
@@ -460,7 +513,6 @@ got_it:
        /* OFFSET_CACHE */
 out_unlock:
        UnlockPage(page);
-out_page:
        ext2_put_page(page);
 out:
        return err;
@@ -484,6 +536,12 @@ int ext2_delete_entry (struct ext2_dir_e
        while ((char*)de < (char*)dir) {
                pde = de;
                de = ext2_next_entry(de);
+               if (de->rec_len == 0) {
+                       ext2_error(inode->i_sb, __FUNCTION__,
+                               "zero-length directory entry");
+                       err = -EIO;
+                       goto out;
+               }
        }
        if (pde)
                from = (char*)pde - (char*)page_address(page);
@@ -496,9 +554,10 @@ int ext2_delete_entry (struct ext2_dir_e
        dir->inode = 0;
        err = ext2_commit_chunk(page, from, to);
        UnlockPage(page);
-       ext2_put_page(page);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        mark_inode_dirty(inode);
+out:
+       ext2_put_page(page);
        return err;
 }

@@ -550,7 +609,7 @@ int ext2_empty_dir (struct inode * inode
 {
        struct page *page = NULL;
        unsigned long i, npages = dir_pages(inode);
-      
+
        for (i = 0; i < npages; i++) {
                char *kaddr;
                ext2_dirent * de;
@@ -561,7 +620,7 @@ int ext2_empty_dir (struct inode * inode

                kaddr = page_address(page);
                de = (ext2_dirent *)kaddr;
-               kaddr += PAGE_CACHE_SIZE-EXT2_DIR_REC_LEN(1);
+               kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1);

                while ((char *)de <= kaddr) {
                        if (de->inode != 0) {
@@ -578,6 +637,11 @@ int ext2_empty_dir (struct inode * inode
                                        goto not_empty;
                        }
                        de = ext2_next_entry(de);
+                       if (de->rec_len == 0) {
+                               ext2_error(inode->i_sb, __FUNCTION__,
+                                       "zero-length directory entry");
+                               goto not_empty;
+                       }
                }
                ext2_put_page(page);
        }

-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

ext2 directory handling

Post by Andrew Morto » Thu, 11 Apr 2002 20:30:19


That was an old version.  This is it:

--- 2.5.8-pre3/fs/ext2/dir.c~dallocbase-55-ext2_dir     Wed Apr 10 00:42:47 2002
+++ 2.5.8-pre3-akpm/fs/ext2/dir.c       Wed Apr 10 03:11:50 2002
@@ -46,6 +46,21 @@ static inline unsigned long dir_pages(st
        return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
 }

+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ext2_last_byte(struct inode *inode, unsigned long page_nr)
+{
+       unsigned last_byte = inode->i_size;
+
+       last_byte -= page_nr << PAGE_CACHE_SHIFT;
+       if (last_byte > PAGE_CACHE_SIZE)
+               last_byte = PAGE_CACHE_SIZE;
+       return last_byte;
+}
+
 static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to)
 {
        struct inode *dir = page->mapping->host;
@@ -78,10 +93,6 @@ static void ext2_check_page(struct page
                limit = dir->i_size & ~PAGE_CACHE_MASK;
                if (limit & (chunk_size - 1))
                        goto Ebadsize;
-               for (offs = limit; offs<PAGE_CACHE_SIZE; offs += chunk_size) {
-                       ext2_dirent *p = (ext2_dirent*)(kaddr + offs);
-                       p->rec_len = cpu_to_le16(chunk_size);
-               }
                if (!limit)
                        goto out;
        }
@@ -197,8 +208,11 @@ ext2_validate_entry(char *base, unsigned
 {
        ext2_dirent *de = (ext2_dirent*)(base + offset);
        ext2_dirent *p = (ext2_dirent*)(base + (offset&mask));
-       while ((char*)p < (char*)de)
+       while ((char*)p < (char*)de) {
+               if (p->rec_len == 0)
+                       break;
                p = ext2_next_entry(p);
+       }
        return (char *)p - base;
 }

@@ -245,6 +259,7 @@ ext2_readdir (struct file * filp, void *
        unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
        unsigned char *types = NULL;
        int need_revalidate = (filp->f_version != inode->i_version);
+       int ret = 0;

        if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
                goto done;
@@ -265,8 +280,15 @@ ext2_readdir (struct file * filp, void *
                        need_revalidate = 0;
                }
                de = (ext2_dirent *)(kaddr+offset);
-               limit = kaddr + PAGE_CACHE_SIZE - EXT2_DIR_REC_LEN(1);
-               for ( ;(char*)de <= limit; de = ext2_next_entry(de))
+               limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1);
+               for ( ;(char*)de <= limit; de = ext2_next_entry(de)) {
+                       if (de->rec_len == 0) {
+                               ext2_error(sb, __FUNCTION__,
+                                       "zero-length directory entry");
+                               ret = -EIO;
+                               ext2_put_page(page);
+                               goto done;
+                       }
                        if (de->inode) {
                                int over;
                                unsigned char d_type = DT_UNKNOWN;
@@ -283,6 +305,7 @@ ext2_readdir (struct file * filp, void *
                                        goto done;
                                }
                        }
+               }
                ext2_put_page(page);
        }

@@ -326,8 +349,14 @@ struct ext2_dir_entry_2 * ext2_find_entr
                if (!IS_ERR(page)) {
                        kaddr = page_address(page);
                        de = (ext2_dirent *) kaddr;
-                       kaddr += PAGE_CACHE_SIZE - reclen;
+                       kaddr += ext2_last_byte(dir, n) - reclen;
                        while ((char *) de <= kaddr) {
+                               if (de->rec_len == 0) {
+                                       ext2_error(dir->i_sb, __FUNCTION__,
+                                               "zero-length directory entry");
+                                       ext2_put_page(page);
+                                       goto out;
+                               }
                                if (ext2_match (namelen, name, de))
                                        goto found;
                                de = ext2_next_entry(de);
@@ -337,6 +366,7 @@ struct ext2_dir_entry_2 * ext2_find_entr
                if (++n >= npages)
                        n = 0;
        } while (n != start);
+out:
        return NULL;

 found:
@@ -401,6 +431,7 @@ int ext2_add_link (struct dentry *dentry
        struct inode *dir = dentry->d_parent->d_inode;
        const char *name = dentry->d_name.name;
        int namelen = dentry->d_name.len;
+       unsigned chunk_size = ext2_chunk_size(dir);
        unsigned reclen = EXT2_DIR_REC_LEN(namelen);
        unsigned short rec_len, name_len;
        struct page *page = NULL;
@@ -411,19 +442,41 @@ int ext2_add_link (struct dentry *dentry
        unsigned from, to;
        int err;

-       /* We take care of directory expansion in the same loop */
+       /*
+        * We take care of directory expansion in the same loop.
+        * This code plays outside i_size, so it locks the page
+        * to protect that region.
+        */
        for (n = 0; n <= npages; n++) {
+               char *dir_end;
+
                page = ext2_get_page(dir, n);
                err = PTR_ERR(page);
                if (IS_ERR(page))
                        goto out;
+               lock_page(page);
                kaddr = page_address(page);
+               dir_end = kaddr + ext2_last_byte(dir, n);
                de = (ext2_dirent *)kaddr;
                kaddr += PAGE_CACHE_SIZE - reclen;
                while ((char *)de <= kaddr) {
                        err = -EEXIST;
                        if (ext2_match (namelen, name, de))
-                               goto out_page;
+                               goto out_unlock;
+                       if ((char *)de == dir_end) {
+                               /* We hit i_size */
+                               name_len = 0;
+                               rec_len = chunk_size;
+                               de->rec_len = cpu_to_le16(chunk_size);
+                               de->inode = 0;
+                               goto got_it;
+                       }
+                       if (de->rec_len == 0) {
+                               ext2_error(dir->i_sb, __FUNCTION__,
+                                       "zero-length directory entry");
+                               err = -EIO;
+                               goto out_unlock;
+                       }
                        name_len = EXT2_DIR_REC_LEN(de->name_len);
                        rec_len = le16_to_cpu(de->rec_len);
                        if (!de->inode && rec_len >= reclen)
@@ -432,6 +485,7 @@ int ext2_add_link (struct dentry *dentry
                                goto got_it;
                        de = (ext2_dirent *) ((char *) de + rec_len);
                }
+               unlock_page(page);
                ext2_put_page(page);
        }
        BUG();
@@ -440,7 +494,6 @@ int ext2_add_link (struct dentry *dentry
 got_it:
        from = (char*)de - (char*)page_address(page);
        to = from + rec_len;
-       lock_page(page);
        err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
        if (err)
                goto out_unlock;
@@ -460,7 +513,6 @@ got_it:
        /* OFFSET_CACHE */
 out_unlock:
        UnlockPage(page);
-out_page:
        ext2_put_page(page);
 out:
        return err;
@@ -482,6 +534,12 @@ int ext2_delete_entry (struct ext2_dir_e
        int err;

        while ((char*)de < (char*)dir) {
+               if (de->rec_len == 0) {
+                       ext2_error(inode->i_sb, __FUNCTION__,
+                               "zero-length directory entry");
+                       err = -EIO;
+                       goto out;
+               }
                pde = de;
                de = ext2_next_entry(de);
        }
@@ -496,9 +554,10 @@ int ext2_delete_entry (struct ext2_dir_e
        dir->inode = 0;
        err = ext2_commit_chunk(page, from, to);
        UnlockPage(page);
-       ext2_put_page(page);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        mark_inode_dirty(inode);
+out:
+       ext2_put_page(page);
        return err;
 }

@@ -550,7 +609,7 @@ int ext2_empty_dir (struct inode * inode
 {
        struct page *page = NULL;
        unsigned long i, npages = dir_pages(inode);
-      
+
        for (i = 0; i < npages; i++) {
                char *kaddr;
                ext2_dirent * de;
@@ -561,9 +620,15 @@ int ext2_empty_dir (struct inode * inode

                kaddr = page_address(page);
                de = (ext2_dirent *)kaddr;
-               kaddr += PAGE_CACHE_SIZE-EXT2_DIR_REC_LEN(1);
+               kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1);

                while ((char *)de <= kaddr) {
+                       if (de->rec_len == 0) {
+                               ext2_error(inode->i_sb, __FUNCTION__,
+                                       "zero-length directory entry");
+                               printk("kaddr=%p, de=%p\n", kaddr, de);
+                               goto not_empty;
+                       }
                        if (de->inode != 0) {
                                /* check for . and .. */
                                if (de->name[0] != '.')

-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

1. Ext2 directory index, updated

***N.B.: still for use on test partitions only.***

This update mainly fixes a bug, a one-in-a-million occurance on an untested
code path.  This bug resulted in rm -rf deleting all files but one from a
million-file directory.  I believe that's the last untested code path, and
otherwise it's been very stable.

I didn't expect highmem to work properly, and it didn't.  It's on my to-do
list, but for now highmem has to be off or you will oops on boot.

I elaborated the dx_show_buckets debug output to show dump the full index
tree instead of just one level.  This function now serves as a capsule
summary of the index tree structure, and as you can see, it's simple.

I've done quite a bit more testing, including stress testing on a real
machine and I find that everything works quite comfortably up to about 2
million files, turning in an average time of about 50 microseconds/create and
300 microseconds/delete (1 GHz PIII).  In the 4 million file range things go
pear-shaped, which I believe is not due to the index patch, but to rd.  The
runs do complete, but require exponentially more time, with cpu 98% idle and
block throughput in the 300/second range.  I'll look into that more later.

I did run into some bad mm behavior on 2.4.13.  The icache seems to be too
severely throttled, resulting in delete performance being less than it should
be.  I also find I am rarely unable to create a million file test run on uml
(2.4.13) without oom-ing.  In my experience, such problems are not due to
uml, but to the kernel's memory manager.  These issues may have been
addressed in recent pre-patch kernels, but it seems there is a still some
room for improvement in mm stability.

The patch is available at:

  http://nl.linux.org/~phillips/htree/ext2.index-2.4.13

To apply:

  cd /your/source/tree
  patch -p0 <this.patch

--
Daniel

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

2. Ping Problem!

3. ext2 performance with large directories

4. OSR 5.0.2 w/Vdisk 1.1 and panic autodump not automatic

5. Ext2 zeros inode in directory entry when deleting files.

6. LyX and KDE?

7. Kernel problem : ext2 fs directory remained locked ?

8. Hi people!

9. Performance of Ext2 directory operations

10. ext2: lost parent directory

11. Ext2 Directory Index - File Structure

12. Q: ext2 refuses hardlink on directory

13. Ext2 Directory Index, updated