concurrent block/inode allocation for EXT3

concurrent block/inode allocation for EXT3

Post by Alex Tomas » Tue, 25 Mar 2003 19:10:07



hi!

this time, concurrent block/inode allocation for EXT3 against 2.5.65.
should be applied with ext2-concurrent-balloc because of ext2_set_bit_atomic()
and ext2_clear_bit_atomic().

to see real improvement, you should use 2.5.65-mm4, in which Andrew Morton pushed
the BKL down into JBD.

1) each group has own spinlock, which is used for group counter
   modifications
2) sb->s_free_blocks_count isn't used any more.  ext2_statfs() and
   find_group_orlov() loop over groups to count free blocks
3) sb->s_free_blocks_count is recalculated at mount/umount/sync_super time
   in order to check consistency and to avoid fsck warnings
4) reserved blocks are distributed over last groups
5) ext3_new_block() tries to use non-reserved blocks and if it fails then
   tries to use reserved blocks
6) ext3_new_block() and ext3_free_blocks do not modify sb->s_free_blocks,
   therefore they do not call mark_buffer_dirty() for superblock's
   buffer_head. this should reduce I/O a bit

diff -puNr linux-2.5.65/fs/ext3/balloc.c edited/fs/ext3/balloc.c
--- linux-2.5.65/fs/ext3/balloc.c       Thu Feb 20 16:19:06 2003
+++ edited/fs/ext3/balloc.c     Mon Mar 24 16:17:40 2003
@@ -118,7 +118,6 @@ void ext3_free_blocks (handle_t *handle,
                printk ("ext3_free_blocks: nonexistent device");
                return;
        }
-       lock_super (sb);
        es = EXT3_SB(sb)->s_es;
        if (block < le32_to_cpu(es->s_first_data_block) ||
            block + count < block ||
@@ -184,11 +183,6 @@ do_more:
        if (err)
                goto error_return;

-       BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
-       err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-       if (err)
-               goto error_return;
-
        for (i = 0; i < count; i++) {
                /*
                 * An HJ special.  This is expensive...
@@ -208,18 +202,15 @@ do_more:
                }
 #endif
                BUFFER_TRACE(bitmap_bh, "clear bit");
-               if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
+               if (!ext3_clear_bit_atomic (&EXT3_SB(sb)->s_bgi[block_group].bg_balloc_lock,
+                                               bit + i, bitmap_bh->b_data)) {
                        ext3_error (sb, __FUNCTION__,
                                      "bit already cleared for block %lu",
                                      block + i);
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
-               } else {
+               } else
                        dquot_freed_blocks++;
-                       gdp->bg_free_blocks_count =
-                         cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
-                       es->s_free_blocks_count =
-                         cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
-               }
+
                /* @@@ This prevents newly-allocated data from being
                 * freed and then reallocated within the same
                 * transaction.
@@ -244,6 +235,11 @@ do_more:
                ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
        }

+       spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_balloc_lock);
+       gdp->bg_free_blocks_count =
+               cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + dquot_freed_blocks);
+       spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_balloc_lock);
+
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        err = ext3_journal_dirty_metadata(handle, bitmap_bh);
@@ -253,11 +249,6 @@ do_more:
        ret = ext3_journal_dirty_metadata(handle, gd_bh);
        if (!err) err = ret;

-       /* And the superblock */
-       BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock");
-       ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-       if (!err) err = ret;
-
        if (overflow && !err) {
                block += count;
                count = overflow;
@@ -267,7 +258,6 @@ do_more:
 error_return:
        brelse(bitmap_bh);
        ext3_std_error(sb, err);
-       unlock_super(sb);
        if (dquot_freed_blocks)
                DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
        return;
@@ -367,6 +357,61 @@ static int find_next_usable_block(int st
        return -1;
 }

+
+int
+ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+                               struct buffer_head *bitmap_bh, int goal,
+                               int *errp)
+{
+       int i, fatal = 0;
+
+       *errp = 0;
+
+       if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+               goto got;
+
+repeat:
+       goal = find_next_usable_block(goal, bitmap_bh,
+                               EXT3_BLOCKS_PER_GROUP(sb));
+       if (goal < 0)
+               return -1;
+
+       for (i = 0;
+               i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+               i++, goal--);
+
+got:
+       /* Make sure we use undo access for the bitmap, because it is
+        * critical that we do the frozen_data COW on bitmap buffers in
+        * all cases even if the buffer is in BJ_Forget state in the
+        * committing transaction.  */
+       BUFFER_TRACE(bitmap_bh, "get undo access for marking new block");
+       fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+       if (fatal) {
+               *errp = fatal;
+               return -1;
+       }
+
+       if (ext3_set_bit_atomic(&EXT3_SB(sb)->s_bgi[group].bg_balloc_lock,
+                               goal, bitmap_bh->b_data)) {
+               /* already allocated by concurrent thread -bzzz */
+               goal++;
+               if (goal >= EXT3_BLOCKS_PER_GROUP(sb))
+                       return -1;
+               goto repeat;
+       }
+
+       BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+       fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
+       if (fatal) {
+               *errp = fatal;
+               return -1;
+       }
+
+       return goal;
+}
+
+
 /*
  * ext3_new_block uses a goal block to assist allocation.  If the goal is
  * free, or there is a free block within 32 blocks of the goal, that block
@@ -387,6 +432,7 @@ ext3_new_block(handle_t *handle, struct
        int target_block;                       /* tmp */
        int fatal = 0, err;
        int performed_allocation = 0;
+       int free, use_reserve = 0;
        struct super_block *sb;
        struct ext3_group_desc *gdp;
        struct ext3_super_block *es;
@@ -408,16 +454,7 @@ ext3_new_block(handle_t *handle, struct
                return 0;
        }

-       lock_super(sb);
        es = EXT3_SB(sb)->s_es;
-       if (le32_to_cpu(es->s_free_blocks_count) <=
-                       le32_to_cpu(es->s_r_blocks_count) &&
-           ((EXT3_SB(sb)->s_resuid != current->fsuid) &&
-            (EXT3_SB(sb)->s_resgid == 0 ||
-             !in_group_p(EXT3_SB(sb)->s_resgid)) &&
-            !capable(CAP_SYS_RESOURCE)))
-               goto out;
-
        ext3_debug("goal=%lu.\n", goal);

        /*
@@ -431,40 +468,28 @@ ext3_new_block(handle_t *handle, struct
        gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
        if (!gdp)
                goto io_error;
-
-       if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+
+       free = le16_to_cpu(gdp->bg_free_blocks_count);
+       free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
+       if (free > 0) {
                ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
                                EXT3_BLOCKS_PER_GROUP(sb));
-#ifdef EXT3FS_DEBUG
-               if (ret_block)
-                       goal_attempts++;
-#endif
                bitmap_bh = read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
-                       goto io_error;
-
-               ext3_debug("goal is at %d:%d.\n", group_no, ret_block);
-
-               if (ext3_test_allocatable(ret_block, bitmap_bh)) {
-#ifdef EXT3FS_DEBUG
-                       goal_hits++;
-                       ext3_debug("goal bit allocated.\n");
-#endif
-                       goto got_block;
-               }
-
-               ret_block = find_next_usable_block(ret_block, bitmap_bh,
-                               EXT3_BLOCKS_PER_GROUP(sb));
+                       goto io_error;  
+               ret_block = ext3_try_to_allocate(sb, handle, group_no, bitmap_bh,
+                                               ret_block, &fatal);
+               if (fatal)
+                       goto out;
                if (ret_block >= 0)
-                       goto search_back;
+                       goto allocated;
        }
-
-       ext3_debug("Bit not found in block group %d.\n", group_no);
-
+      
        /*
         * Now search the rest of the groups.  We assume that
         * i and gdp correctly point to the last group visited.
         */
+repeat:
        for (bit = 0; bit < EXT3_SB(sb)->s_groups_count; bit++) {
                group_no++;
                if (group_no >= EXT3_SB(sb)->s_groups_count)
@@ -474,57 +499,47 @@ ext3_new_block(handle_t *handle, struct
                        *errp = -EIO;
                        goto out;
                }
-               if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
-                       brelse(bitmap_bh);
-                       bitmap_bh = read_block_bitmap(sb, group_no);
-                       if (!bitmap_bh)
-                               goto io_error;
-                       ret_block = find_next_usable_block(-1, bitmap_bh,
-                                                  EXT3_BLOCKS_PER_GROUP(sb));
-                       if (ret_block >= 0)
-                               goto search_back;
-               }
-       }
+               free = le16_to_cpu(gdp->bg_free_blocks_count);
+               if (!use_reserve)
+                       free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
+               if (free <= 0)
+                       continue;

+               brelse(bitmap_bh);
+               bitmap_bh = read_block_bitmap(sb, group_no);
+               if (!bitmap_bh)
+                       goto io_error;
+               ret_block = ext3_try_to_allocate(sb, handle, group_no,
+                                               bitmap_bh, -1, &fatal);
+               if (fatal)
+                       goto out;
+               if (ret_block >= 0)
+                       goto allocated;
+       }
+
+       if (!use_reserve &&
+               (EXT3_SB(sb)->s_resuid == current->fsuid ||
+                 (EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) ||
+                 capable(CAP_SYS_RESOURCE))) {
+               use_reserve = 1;
+               group_no = 0;
+               goto repeat;
+       }
+
        /* No space left on the device */
+       *errp = -ENOSPC;
        goto out;

-search_back:
-       /*
-        * We have succeeded in finding a free byte in the block
-        * bitmap.  Now search backwards up to 7 bits to find the
-        * start of this group of free blocks.
-        */
-       for (   bit = 0;
-               bit < 7 && ret_block > 0 &&
-                       ext3_test_allocatable(ret_block - 1, bitmap_bh);
-               bit++, ret_block--)
-               ;
-      
-got_block:
+allocated:

        ext3_debug("using block group %d(%d)\n",
                        group_no, gdp->bg_free_blocks_count);

-       /* Make sure we use undo access for the bitmap, because it is
-           critical that we do the frozen_data COW on bitmap buffers in
-           all cases even if the buffer is in BJ_Forget state in the
-           committing transaction.  */
-       BUFFER_TRACE(bitmap_bh, "get undo access for marking new block");
-       fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
-       if (fatal)
-               goto out;
-      
        BUFFER_TRACE(gdp_bh, "get_write_access");
        fatal = ext3_journal_get_write_access(handle, gdp_bh);
        if (fatal)
                goto out;

-       BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
-       fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-       if (fatal)
-               goto out;
-
        target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
                                + le32_to_cpu(es->s_first_data_block);

@@ -536,11 +551,6 @@ got_block:
                            "Allocating block in system zone - "
                            "block = %u", target_block);

-       /* The superblock lock should guard against anybody else beating
-        * us to this point! */
-       J_ASSERT_BH(bitmap_bh, !ext3_test_bit(ret_block, bitmap_bh->b_data));
-       BUFFER_TRACE(bitmap_bh, "setting bitmap bit");
-       ext3_set_bit(ret_block, bitmap_bh->b_data);
...

read more »

 
 
 

concurrent block/inode allocation for EXT3

Post by Andrew Morton » Wed, 26 Mar 2003 00:50:14



> hi!

> this time, concurrent block/inode allocation for EXT3 against 2.5.65.

the balloc.c part looks OK.  But we do need to be atomic against
b_committed_data as well.

I'm not sure whether it's legal to mix the nonatomic ext2_test_bit() with the
atomic test-and-set operations.  I'll find that out, but it'll be OK for
now.

You seem to have lost the b_committed_data assertion.  Was there a reason for
that?

--- 25/fs/ext3/balloc.c~ext3-concurrent-block-allocation-1      Mon Mar 24 16:01:22 2003

                BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
                J_ASSERT_BH(bitmap_bh,
                                bh2jh(bitmap_bh)->b_committed_data != NULL);
-               ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
+               ext3_set_bit_atomic(&EXT3_SB(sb)->s_bgi[group].bg_balloc_lock,
+                               bit + i, bh2jh(bitmap_bh)->b_committed_data);
        }


        struct buffer_head *gdp_bh;             /* bh2 */
        int group_no;                           /* i */
        int ret_block;                          /* j */
-       int bit;                                /* k */
+       int bgi;                                /* blockgroup iteration index */
        int target_block;                       /* tmp */
        int fatal = 0, err;

                bitmap_bh = read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;  
-               ret_block = ext3_try_to_allocate(sb, handle, group_no, bitmap_bh,
-                                               ret_block, &fatal);
+               ret_block = ext3_try_to_allocate(sb, handle, group_no,
+                                       bitmap_bh, ret_block, &fatal);
                if (fatal)
                        goto out;

         * i and gdp correctly point to the last group visited.
         */
 repeat:
-       for (bit = 0; bit < EXT3_SB(sb)->s_groups_count; bit++) {
+       for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) {
                group_no++;
                if (group_no >= EXT3_SB(sb)->s_groups_count)

                }
        }
 #endif
+       if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
+               J_ASSERT_BH(bitmap_bh,
+                       !ext3_test_bit(ret_block,
+                                       bh2jh(bitmap_bh)->b_committed_data));
        ext3_debug("found bit %d\n", ret_block);

        /* ret_block was blockgroup-relative.  Now it becomes fs-relative */

_

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

concurrent block/inode allocation for EXT3

Post by Andrew Morton » Wed, 26 Mar 2003 01:00:20



> hi!

> this time, concurrent block/inode allocation for EXT3 against 2.5.65.

And the inode allocator changes look fine, thanks.  It is time to test this
puppy.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/