Make Ext3 fsck way faster [2.6.23.13] [LWN.net]

From: Abhishek Rai <abhishekrai@google.com>
To: linux-kernel@vger.kernel.org
Subject: [CALL FOR TESTING] Make Ext3 fsck way faster [2.6.23.13]
Date: Sun, 13 Jan 2008 00:41:55 -0500 (EST)
Message-ID: <Pine.LNX.4.64.0801130026480.4286@vmfs7.nyc.corp.google.com>
Cc: rohitseth@google.com, akpm@linux-foundation.org, phillips@phunq.net

Hi,

This patch speeds up e2fsck on Ext3 significantly using a technique called metaclustering. Metaclustering is being discussed on LKML in some other threads; here is the latest patch against the 2.6.23.13 stable kernel, and any help in testing and evaluating it will be greatly appreciated! I'd direct anyone interested in the details of metaclustering to the other LKML threads discussing it, but I've enclosed a brief description here:

Metaclustering refers to storing indirect blocks in clusters on a per-group basis instead of spreading them out along with the data blocks. This makes e2fsck faster since it can now read and verify all indirect blocks without many seeks. Done naively, however, it can hurt I/O performance, so we have built in some optimizations to prevent that from happening. Finally, the benefit to fsck performance is noticeable only when indirect block reads are the bottleneck. That is not always the case, but it frequently is on moderate to large disks with a lot of data on them. When indirect block reads are not the bottleneck, e2fsck is generally fast enough anyway that the improvement matters little.
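To make the layout concrete, here is a rough, illustrative sketch (not part of the patch) of how each block group is split when the metacluster mount option is enabled. It mirrors the s_nonmc_blocks_per_group computation in the super.c hunk further down: roughly 1/12th of each group, rounded down to a multiple of 8 blocks, is set aside at the end of the group for indirect blocks. The 32768 blocks-per-group figure is only an assumed example for a 4KB-block filesystem.

/*
 * Illustrative sketch only: where the metacluster region would start
 * in a block group, following the computation used by this patch
 * (nonmc = blocks_per_group - blocks_per_group/12, rounded to a
 * multiple of 8). The group size below is an assumed example.
 */
#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 32768;	/* example: 4KB blocks */

	unsigned long nonmc = blocks_per_group - blocks_per_group / 12;
	nonmc &= ~7UL;	/* align to bitmap byte boundary */

	printf("blocks per group:     %lu\n", blocks_per_group);
	printf("data (non-mc) region: blocks 0..%lu\n", nonmc - 1);
	printf("metacluster region:   blocks %lu..%lu (%lu blocks)\n",
	       nonmc, blocks_per_group - 1, blocks_per_group - nonmc);
	return 0;
}

Within that tail region, the allocation helper added in balloc.c searches backwards from the last block of the group towards s_nonmc_blocks_per_group, so a group's indirect blocks end up packed together and fsck can sweep them with a handful of large sequential reads.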

Thanks! Abhishek

Signed-off-by: Abhishek Rai abhishekrai@google.com

diff -rupdN linux-2.6.23.13-clean/fs/ext3/balloc.c linux-2.6.23.13-ext3mc/fs/ext3/balloc.c
--- linux-2.6.23.13-clean/fs/ext3/balloc.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/balloc.c	2008-01-12 23:59:36.000000000 -0500
@@ -33,6 +33,29 @@

@@ -108,6 +131,88 @@ read_block_bitmap(struct super_block sb error_out: return bh; } + + +/

+{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_bg_info bgi = &sbi->s_bginfo[block_group]; + + BUG_ON(!test_opt(sb, METACLUSTER)); + + spin_lock(sb_bgl_lock(sbi, block_group)); + if (bgi->bgi_free_nonmc_blocks_count >= 0) + goto out; + + bgi->bgi_free_nonmc_blocks_count = + ext3_count_free(bitmap_bh, sbi->s_nonmc_blocks_per_group/8); + +out: + spin_unlock(sb_bgl_lock(sbi, block_group)); + BUG_ON(bgi->bgi_free_nonmc_blocks_count > + sbi->s_nonmc_blocks_per_group); +} + +/

+{ + struct ext3_bg_info bginfo = &sbi->s_bginfo[group_no]; + ext3_grpblk_t change; + + BUG_ON(bginfo->bgi_free_nonmc_blocks_count < 0); + BUG_ON(start >= sbi->s_nonmc_blocks_per_group); + + change = min_t(ext3_grpblk_t, start + count, + sbi->s_nonmc_blocks_per_group) - start; + + spin_lock(sb_bgl_lock(sbi, group_no)); + BUG_ON(bginfo->bgi_free_nonmc_blocks_count > + sbi->s_nonmc_blocks_per_group); + BUG_ON(allocation && bginfo->bgi_free_nonmc_blocks_count < change); + + bginfo->bgi_free_nonmc_blocks_count += (allocation ? -change : change); + + BUG_ON(bginfo->bgi_free_nonmc_blocks_count > + sbi->s_nonmc_blocks_per_group); + spin_unlock(sb_bgl_lock(sbi, group_no)); +} + +/

+{ + return !(blk >= 0 && blk >= sbi->s_nonmc_blocks_per_group && + bgi->bgi_free_nonmc_blocks_count >= 8); +} + /*

@@ -424,6 +529,7 @@ void ext3_free_blocks_sb(handle_t *handl
 	struct ext3_group_desc * desc;
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi;
+	struct ext3_bg_info *bgi;
 	int err = 0, ret;
 	ext3_grpblk_t group_freed;

@@ -463,6 +569,13 @@ do_more: if (!desc) goto error_return;

@@ -582,6 +695,9 @@ do_more: if (!err) err = ret; *pdquot_freed_blocks += group_freed;

@@ -687,6 +803,50 @@ bitmap_search_next_usable_block(ext3_grp
 	return -1;
 }

+static ext3_grpblk_t
+bitmap_find_prev_zero_bit(char *map, ext3_grpblk_t start, ext3_grpblk_t lowest)
+{
+	ext3_grpblk_t k, blk;
+
+	k = start & ~7;
+	while (lowest <= k) {
+		if (map[k/8] != '\255' &&
+		    (blk = ext3_find_next_zero_bit(map, k + 8, k))
+		     < (k + 8))
+			return blk;
+
+		k -= 8;
+	}
+	return -1;
+}
+
+static ext3_grpblk_t
+bitmap_search_prev_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+				ext3_grpblk_t lowest)
+{
+	ext3_grpblk_t next;
+	struct journal_head *jh = bh2jh(bh);
+
+	/*
+	 * The bitmap search --- search backward alternately through the
+	 * actual bitmap and the last-committed copy until we find a bit
+	 * free in both
+	 */
+	while (start >= lowest) {
+		next = bitmap_find_prev_zero_bit(bh->b_data, start, lowest);
+		if (next < lowest)
+			return -1;
+		if (ext3_test_allocatable(next, bh))
+			return next;
+		jbd_lock_bh_state(bh);
+		if (jh->b_committed_data)
+			start = bitmap_find_prev_zero_bit(jh->b_committed_data,
+							  next, lowest);
+		jbd_unlock_bh_state(bh);
+	}
+	return -1;
+}
+
 /*

@@ -851,8 +1019,10 @@ repeat: } start = grp_goal;

@@ -867,8 +1037,8 @@ repeat:
 	grp_goal++;
 	while (num < *count && grp_goal < end
 		&& ext3_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group),
-				grp_goal, bitmap_bh)) {
+		&& (!metaclustering || allow_mc_alloc(sbi, bgi, grp_goal))
+		&& claim_block(sb_bgl_lock(sbi, group), grp_goal, bitmap_bh)) {
 		num++;
 		grp_goal++;
 	}
@@ -1099,7 +1269,9 @@ static int alloc_new_reservation(struct

  /*
   * find_next_reservable_window() simply finds a reservable window

@@ -1131,10 +1303,17 @@ retry: my_rsv->rsv_start - group_first_block, bitmap_bh, group_end_block - group_first_block + 1);

@@ -1237,25 +1416,17 @@ ext3_try_to_allocate_with_rsv(struct sup
 			unsigned int group, struct buffer_head *bitmap_bh,
 			ext3_grpblk_t grp_goal,
 			struct ext3_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
+			unsigned long *count)
 {
+	struct ext3_bg_info *bgi;
 	ext3_fsblk_t group_first_block, group_last_block;
 	ext3_grpblk_t ret = 0;
-	int fatal;
 	unsigned long num = *count;

@@ -1331,19 +1502,6 @@ ext3_try_to_allocate_with_rsv(struct sup
 		num = *count;
 	}
 out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}

@@ -1389,22 +1547,149 @@ int ext3_should_retry_alloc(struct super return journal_force_commit_nested(EXT3_SB(sb)->s_journal); }

+/*

+{ + struct ext3_bg_info bgi = &EXT3_SB(sb)->s_bginfo[group_no]; + ext3_grpblk_t blk = EXT3_BLOCKS_PER_GROUP(sb) - 1; + ext3_grpblk_t mc_start = EXT3_SB(sb)->s_nonmc_blocks_per_group; + ext3_fsblk_t group_first_block; + int allocated = 0; + + BUG_ON(!test_opt(sb, METACLUSTER)); + + / This check is racy but that wouldn't harm us. / + if (bgi->bgi_free_nonmc_blocks_count >= + le16_to_cpu(gdp->bg_free_blocks_count)) + return 0; + + group_first_block = ext3_group_first_block_no(sb, group_no); + while (allocated < indirect_blks && blk >= mc_start) { + if (!ext3_test_allocatable(blk, bitmap_bh)) { + blk = bitmap_search_prev_usable_block(blk, bitmap_bh, + mc_start); + continue; + } + if (claim_block(sb_bgl_lock(EXT3_SB(sb), group_no), blk, + bitmap_bh)) { + new_blocks[allocated++] = group_first_block + blk; + } else { + / + * The block was allocated by another thread, or it + * was allocated and then freed by another thread + / + cpu_relax(); + } + if (allocated < indirect_blks) + blk = bitmap_search_prev_usable_block(blk, bitmap_bh, + mc_start); + } + return allocated; +} + +/

+{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + struct ext3_sb_info sbi = EXT3_SB(sb); + ext3_fsblk_t grp_blk = blk - ext3_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), blk, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), blk, num) || + in_range(blk, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range(blk + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error(sb, "ext3_new_blocks", + "Allocating block in system zone - " + "blocks from "E3FSBLK", length %lu", + blk, num); + +#ifdef CONFIG_JBD_DEBUG + { + struct buffer_head debug_bh; + + / Record bitmap buffer state in the newly allocated block / + debug_bh = sb_find_get_block(sb, blk); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); + brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); + spin_lock(sb_bgl_lock(sbi, group_no)); + if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { + int i; + + for (i = 0; i < num; i++) { + if (ext3_test_bit(grp_blk+i, + bh2jh(bitmap_bh)->b_committed_data)) + printk(KERN_ERR "%s: block was unexpectedly set" + " in b_committed_data\n", FUNCTION); + } + } + ext3_debug("found bit %d\n", grp_blk); + spin_unlock(sb_bgl_lock(sbi, group_no)); + jbd_unlock_bh_state(bitmap_bh); +#endif + + if (blk + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext3_error(sb, "ext3_new_blocks", + "block("E3FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", blk, + le32_to_cpu(es->s_blocks_count), group_no, es); + return 1; + } + + return 0; +} + /

+int ext3_new_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *errp) + { struct buffer_head bitmap_bh = NULL; struct buffer_head gdp_bh; @@ -1413,10 +1698,16 @@ ext3_fsblk_t ext3_new_blocks(handle_t h ext3_grpblk_t grp_target_blk; / blockgroup relative goal block / ext3_grpblk_t grp_alloc_blk; / blockgroup-relative allocated block/ ext3_fsblk_t ret_block; / filesyetem-wide allocated block / + ext3_fsblk_t group_first_block; / first block in the group / int bgi; / blockgroup iteration index / int fatal = 0, err; int performed_allocation = 0; ext3_grpblk_t free_blocks; / number of free blocks in a group / + unsigned long ngroups; + unsigned long grp_mc_alloc;/ blocks allocated from mc in a group / + unsigned long grp_alloc; / blocks allocated outside mc in a group / + int indirect_blks_done = 0;/ total ind blocks allocated so far / + int blks_done = 0; / total direct blocks allocated */ struct super_block *sb; struct ext3_group_desc *gdp; struct ext3_super_block *es; @@ -1424,23 +1715,23 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h struct ext3_reserve_window_node *my_rsv = NULL; struct ext3_block_alloc_info *block_i; unsigned short windowsz = 0; + int i; #ifdef EXT3FS_DEBUG static int goal_hits, goal_attempts; #endif - unsigned long ngroups; - unsigned long num = *count;

  *errp = -ENOSPC;
  sb = inode->i_sb;
  if (!sb) {

@@ -1474,73 +1765,194 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h group_no = (goal - le32_to_cpu(es->s_first_data_block)) / EXT3_BLOCKS_PER_GROUP(sb); goal_group = group_no; -retry_alloc: - gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error;

+retry_alloc: + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); ngroups = EXT3_SB(sb)->s_groups_count; smp_rmb();

  /*

@@ -1559,96 +1971,11 @@ retry_alloc: goto out;

allocated:

@@ -1661,7 +1988,13 @@ out: * Undo the block allocation */ if (!performed_allocation) - DQUOT_FREE_BLOCK(inode, count); + DQUOT_FREE_BLOCK(inode, indirect_blks + blks); + / + * Free any indirect blocks we allocated already. If the transaction + * has been aborted this is essentially a no-op. + */ + for (i = 0; i < indirect_blks_done; i++) + ext3_free_blocks(handle, inode, new_blocks[i], 1); brelse(bitmap_bh); return 0; } @@ -1669,9 +2002,13 @@ out: ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ext3_fsblk_t goal, int *errp) { - unsigned long count = 1; + ext3_fsblk_t new_blocks[4];

diff -rupdN linux-2.6.23.13-clean/fs/ext3/bitmap.c linux-2.6.23.13-ext3mc/fs/ext3/bitmap.c
--- linux-2.6.23.13-clean/fs/ext3/bitmap.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/bitmap.c	2008-01-12 22:30:19.000000000 -0500
@@ -11,8 +11,6 @@
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>

-#ifdef EXT3FS_DEBUG

static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};

 unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
@@ -27,6 +25,3 @@ unsigned long ext3_count_free (struct bu
 			nibblemap[(map->b_data[i] >> 4) & 0xf];
 	return (sum);
 }

-#endif /* EXT3FS_DEBUG */

diff -rupdN linux-2.6.23.13-clean/fs/ext3/inode.c linux-2.6.23.13-ext3mc/fs/ext3/inode.c
--- linux-2.6.23.13-clean/fs/ext3/inode.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/inode.c	2008-01-13 00:00:14.000000000 -0500
@@ -36,10 +36,33 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/sort.h>
 #include "xattr.h"
 #include "acl.h"

+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+struct ext3_ind_read_info {
+	int			count;
+	int			seq_prefetch;
+	long			size;
+	struct buffer_head	*bh[0];
+};
+
+# define EXT3_IND_READ_INFO_SIZE(_c) \
+	(sizeof(struct ext3_ind_read_info) + \
+	 sizeof(struct buffer_head *) * (_c))
+
+# define EXT3_IND_READ_MAX	(32)
+
 static int ext3_writepage_trans_blocks(struct inode *inode);
+static Indirect *ext3_read_indblocks(struct inode *inode, int iblock,
+				     int depth, int offsets[4],
+				     Indirect chain[4], int *err);

/*

-typedef struct {
-	__le32	*p;
-	__le32	key;
-	struct buffer_head *bh;
-} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); @@ -352,18 +369,21 @@ static int ext3_block_to_path(struct ino * the whole chain, all way to the data (returns %NULL, *err == 0). */ static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, - Indirect chain[4], int *err) + Indirect chain[4], int ind_readahead, int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + int index;

  *err = 0;
  /* i_data is not going away, no lock needed */
  add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
  if (!p->key)
      goto no_block;

@@ -396,7 +416,11 @@ no_block: * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it.

@@ -475,8 +501,7 @@ static ext3_fsblk_t ext3_find_goal(struc * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block *

@@ -505,75 +530,18 @@ static int ext3_blks_to_allocate(Indirec }

/** - * ext3_alloc_blocks: multiple allocate blocks needed for a branch - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks - */ -static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int indirect_blks, int blks, - ext3_fsblk_t new_blocks[4], int *err) -{ - int target, i; - unsigned long count = 0; - int index = 0; - ext3_fsblk_t current_block = 0; - int ret = 0;

@@ -799,17 +767,21 @@ int ext3_get_blocks_handle(handle_t *han int blocks_to_boundary = 0; int depth; struct ext3_inode_info *ei = EXT3_I(inode); - int count = 0; + int count = 0, ind_readahead; ext3_fsblk_t first_block = 0;

@@ -844,7 +816,7 @@ int ext3_get_blocks_handle(handle_t *han }

  /* Next simple case - plain lookup or failed read of indirect block */

@@ -866,7 +838,8 @@ int ext3_get_blocks_handle(handle_t *han brelse(partial->bh); partial--; } - partial = ext3_get_branch(inode, depth, offsets, chain, &err); + partial = ext3_get_branch(inode, depth, offsets, chain, 0, + &err); if (!partial) { count++; mutex_unlock(&ei->truncate_mutex); @@ -1907,7 +1880,7 @@ static Indirect ext3_find_shared(struct / Make k index the deepest non-null offest + 1 / for (k = depth; k > 1 && !offsets[k-1]; k--) ; - partial = ext3_get_branch(inode, k, offsets, chain, &err); + partial = ext3_get_branch(inode, k, offsets, chain, 0, &err); / Writer: pointers */ if (!partial) partial = chain + k-1; @@ -3230,3 +3203,559 @@ int ext3_change_inode_journal_flag(struc

  return err;

} + +/*

+{ + const struct super_block *sb = inode->i_sb; + unsigned long max_read; + unsigned long ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + unsigned long ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + unsigned long blocks_in_file = + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + unsigned long remaining_ind_blks_in_dind = + (ptrs >= offset_in_dind_block) ? (ptrs - offset_in_dind_block) + : 0; + unsigned long remaining_ind_blks_before_eof = + ((blocks_in_file - EXT3_NDIR_BLOCKS + ptrs - 1) >> ptrs_bits) - + ((block - EXT3_NDIR_BLOCKS) >> ptrs_bits); + + BUG_ON(block >= blocks_in_file); + + max_read = min_t(unsigned long, remaining_ind_blks_in_dind, + remaining_ind_blks_before_eof); + + BUG_ON(max_read < 1); + + return max_read; +} + +static void ext3_read_indblocks_submit(struct bio **pbio, + struct ext3_ind_read_info **pread_info, + int *read_cnt, int seq_prefetch) +{ + struct bio *bio = *pbio; + struct ext3_ind_read_info *read_info = *pread_info; + + BUG_ON(*read_cnt < 1); + + read_info->seq_prefetch = seq_prefetch; + read_info->count = *read_cnt; + read_info->size = bio->bi_size; + bio->bi_private = read_info; + bio->bi_end_io = ext3_ind_read_end_bio; + submit_bio(READ, bio); + + *pbio = NULL; + *pread_info = NULL; + *read_cnt = 0; +} + +struct ind_block_info { + ext3_fsblk_t blockno; + struct buffer_head *bh; +}; + +static int ind_info_cmp(const void *a, const void *b) +{ + struct ind_block_info *info_a = (struct ind_block_info *)a; + struct ind_block_info *info_b = (struct ind_block_info *)b; + + return info_a->blockno - info_b->blockno; +} + +static void ind_info_swap(void *a, void *b, int size) +{ + struct ind_block_info *info_a = (struct ind_block_info *)a; + struct ind_block_info *info_b = (struct ind_block_info *)b; + struct ind_block_info tmp; + + tmp = *info_a; + *info_a = *info_b; + info_b = tmp; +} + +/

+{ + struct buffer_head *bh; + struct bio *bio = NULL; + struct ext3_ind_read_info *read_info = NULL; + int read_cnt = 0, blk; + ext3_fsblk_t prev_blk = 0, io_start_blk = 0, curr; + struct ind_block_info *ind_info = NULL; + int err = 0, ind_info_count = 0; + + BUG_ON(count < 1); + /* Don't move this to ext3_get_max_read() since callers often need to + * trim the count returned by that function. So this bound must only + * be imposed at the last moment. */ + count = min_t(unsigned long, count, EXT3_IND_READ_MAX); + *blocks_done = 0UL; + + if (count == 1 && first_bh) { + lock_buffer(first_bh); + get_bh(first_bh); + first_bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, first_bh); + *blocks_done = 1UL; + return 0; + } + + ind_info = kmalloc(count * sizeof(ind_info), GFP_KERNEL); + if (unlikely(!ind_info)) + return -ENOMEM; + + / + * First pass: sort block numbers for all indirect blocks that we'll + * read. This allows us to scan blocks in sequenial order during the + * second pass which helps coalasce requests to contiguous blocks. + * Since we sort block numbers here instead of assuming any specific + * layout on the disk, we have some protection against different + * indirect block layout strategies as long as they keep all indirect + * blocks close by. + */ + for (blk = 0; blk < count; blk++) { + curr = le32_to_cpu(ind_blocks[blk]); + if (!curr) + continue; + + /* + * Skip this block if it lies too far from blocks we have + * already decided to read. "Too far" should typically indicate + * lying on a different track on the disk. EXT3_IND_READ_MAX + * seems reasonable for most disks. + */ + if (io_start_blk > 0 && + (max(io_start_blk, curr) - min(io_start_blk, curr) >= + EXT3_IND_READ_MAX)) + continue; + + if (blk == 0 && first_bh) { + bh = first_bh; + get_bh(first_bh); + } else { + bh = sb_getblk(sb, curr); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failure; + } + } + + if (buffer_uptodate(bh)) { + if (ext3_buffer_prefetch(bh)) { + brelse(bh); + break; + } + brelse(bh); + continue; + } + + if (io_start_blk == 0) + io_start_blk = curr; + + ind_info[ind_info_count].blockno = curr; + ind_info[ind_info_count].bh = bh; + ind_info_count++; + } + *blocks_done = blk; + + sort(ind_info, ind_info_count, sizeof(ind_info), + ind_info_cmp, ind_info_swap); + + / Second pass: compose bio requests and issue them. / + for (blk = 0; blk < ind_info_count; blk++) { + bh = ind_info[blk].bh; + curr = ind_info[blk].blockno; + + if (prev_blk > 0 && curr != prev_blk + 1) { + ext3_read_indblocks_submit(&bio, &read_info, + &read_cnt, seq_prefetch); + prev_blk = 0; + } + + / Lock the buffer without blocking, skipping any buffers + * which would require us to block. first_bh when specified is + * an exception as caller typically wants it to be read for + * sure (e.g., ext3_read_indblocks_sync). + / + if (bh == first_bh) { + lock_buffer(bh); + } else if (test_set_buffer_locked(bh)) { + brelse(bh); + continue; + } + + / Check again with the buffer locked. / + if (buffer_uptodate(bh)) { + if (ext3_buffer_prefetch(bh)) { + unlock_buffer(bh); + brelse(bh); + break; + } + unlock_buffer(bh); + brelse(bh); + continue; + } + + if (read_cnt == 0) { + / read_info freed in ext3_ind_read_end_bio(). 
*/ + read_info = kmalloc(EXT3_IND_READ_INFO_SIZE(count), + GFP_KERNEL); + if (unlikely(!read_info)) { + err = -ENOMEM; + goto failure; + } + + bio = bio_alloc(GFP_KERNEL, count); + if (unlikely(!bio)) { + err = -ENOMEM; + goto failure; + } + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + } + + if (bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)) + < bh->b_size) { + brelse(bh); + if (read_cnt == 0) + goto failure; + + break; + } + + read_info->bh[read_cnt++] = bh; + prev_blk = curr; + } + + if (read_cnt == 0) + goto done; + + ext3_read_indblocks_submit(&bio, &read_info, &read_cnt, seq_prefetch); + + kfree(ind_info); + return 0; + +failure: + while (--read_cnt >= 0) { + unlock_buffer(read_info->bh[read_cnt]); + brelse(read_info->bh[read_cnt]); + } + blocks_done = 0UL; + +done: + kfree(read_info); + + if (bio) + bio_put(bio); + + kfree(ind_info); + return err; +} + +/

+{ + int err; + + BUG_ON(count < 1); + BUG_ON(!first_bh); + + err = ext3_read_indblocks_async(sb, ind_blocks, count, first_bh, + seq_prefetch, blocks_done); + if (err) + return err; + + wait_on_buffer(first_bh); + if (!buffer_uptodate(first_bh)) + err = -EIO; + + /* if seq_prefetch != 0, ext3_read_indblocks_async() sets prefetch bit + * for all buffers, but the first buffer for sync IO is never a prefetch + * buffer since it's needed presently so mark it so. + / + if (seq_prefetch) + ext3_clear_buffer_prefetch(first_bh); + + BUG_ON(ext3_buffer_prefetch(first_bh)); + + return err; +} + +/

+{ + struct super_block *sb = inode->i_sb; + struct buffer_head *first_bh, *prev_bh; + unsigned long max_read, blocks_done = 0; + __le32 ind_blocks; + + / Must have doubly indirect block for prefetching indirect blocks. / + BUG_ON(depth <= 2); + BUG_ON(!chain[depth-2].key); + + *err = 0; + + /* Handle first block */ + ind_blocks = chain[depth-2].p; + first_bh = sb_getblk(sb, le32_to_cpu(ind_blocks[0])); + if (unlikely(!first_bh)) { + printk(KERN_ERR "Failed to get block %u for sb %p\n", + le32_to_cpu(ind_blocks[0]), sb); + goto failure; + } + + BUG_ON(first_bh->b_size != sb->s_blocksize); + + if (buffer_uptodate(first_bh)) { + / Found the buffer in cache, either it was accessed recently or + * it was prefetched while reading previous indirect block(s). + * We need to figure out if we need to prefetch the following + * indirect blocks. + / + if (!ext3_buffer_prefetch(first_bh)) { + / Either we've seen this indirect block before while + * accessing another data block, or this is a random + * read. In the former case, we must have done the + * needful the first time we had a cache hit on this + * indirect block, in the latter case we obviously + * don't need to do any prefetching. + / + goto done; + } + + max_read = ext3_get_max_read(inode, iblock, + offsets[depth-2]); + + / This indirect block is in the cache due to prefetching and + * this is its first cache hit, clear the prefetch bit and + * make sure the following blocks are also prefetched. + / + ext3_clear_buffer_prefetch(first_bh); + + if (max_read >= 2) { + / ext3_read_indblocks_async() stops at the first + * indirect block which has the prefetch bit set which + * will most likely be the very next indirect block. + / + ext3_read_indblocks_async(sb, &ind_blocks[1], + max_read - 1, + NULL, 1, &blocks_done); + } + + } else { + / Buffer is not in memory, we need to read it. If we are + * reading sequentially from the previous indirect block, we + * have just detected a sequential read and we must prefetch + * some indirect blocks for future. + */ + + max_read = ext3_get_max_read(inode, iblock, + offsets[depth-2]); + + if ((ind_blocks - (__le32 )chain[depth-2].bh->b_data) >= 1) { + prev_bh = sb_getblk(sb, le32_to_cpu(ind_blocks[-1])); + if (buffer_uptodate(prev_bh) && + !ext3_buffer_prefetch(prev_bh)) { + / Detected sequential read. / + brelse(prev_bh); + + / Sync read indirect block, also read the next + * few indirect blocks. + */ + *err = ext3_read_indblocks_sync(sb, ind_blocks, + max_read, first_bh, 1, + &blocks_done); + + if (err) + goto out; + + / In case the very next indirect block is + * discontiguous by a non-trivial amount, + * ext3_read_indblocks_sync() above won't + * prefetch it (indicated by blocks_done < 2). + * So to help sequential read, schedule an + * async request for reading the next + * contiguous indirect block range (which + * in metaclustering case would be the next + * metacluster, without metaclustering it + * would be the next indirect block). This is + * expected to benefit the non-metaclustering + * case. + */ + if (max_read >= 2 && blocks_done < 2) + ext3_read_indblocks_async(sb, + &ind_blocks[1], + max_read - 1, + NULL, 1, &blocks_done); + + goto done; + } + brelse(prev_bh); + } + + /* Either random read, or sequential detection failed above. + * We always prefetch the next indirect block in this case + * whenever possible. + * This is because for random reads of size ~512KB, there is + * >12% chance that a read will span two indirect blocks. 
+ */ + *err = ext3_read_indblocks_sync(sb, ind_blocks, + (max_read >= 2) ? 2 : 1, + first_bh, 0, &blocks_done); + if (err) + goto out; + } + +done: + / Reader: pointers */ + if (!verify_chain(chain, &chain[depth - 2])) { + brelse(first_bh); + goto changed; + } + add_chain(&chain[depth - 1], first_bh, + (_le32 )first_bh->b_data + offsets[depth - 1]); + / Reader: end */ + if (!chain[depth - 1].key) + goto out; + + BUG_ON(!buffer_uptodate(first_bh)); + return NULL; + +changed: + *err = -EAGAIN; + goto out; +failure: + *err = -EIO; +out: + if (*err) { + ext3_debug("Error %d reading indirect blocks\n", *err); + return &chain[depth - 2]; + } else + return &chain[depth - 1]; +} diff -rupdN linux-2.6.23.13-clean/fs/ext3/super.c linux-2.6.23.13-ext3mc/fs/ext3/super.c --- linux-2.6.23.13-clean/fs/ext3/super.c 2008-01-09 12🔞17.000000000 -0500 +++ linux-2.6.23.13-ext3mc/fs/ext3/super.c 2008-01-12 22:30:19.000000000 -0500 @@ -556,6 +556,9 @@ static int ext3_show_options(struct seq else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback");

@@ -684,7 +687,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota + Opt_grpquota, Opt_metacluster };

static match_table_t tokens = { @@ -734,6 +737,7 @@ static match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_metacluster, "metacluster"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -1066,6 +1070,9 @@ clear_qf_name: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; + case Opt_metacluster: + set_opt(sbi->s_mount_opt, METACLUSTER); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option "%s" " @@ -1594,6 +1601,13 @@ static int ext3_fill_super (struct super } sbi->s_frags_per_block = 1; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + if (test_opt(sb, METACLUSTER)) { + sbi->s_nonmc_blocks_per_group = sbi->s_blocks_per_group - + sbi->s_blocks_per_group / 12; + sbi->s_nonmc_blocks_per_group &= ~7; + } else + sbi->s_nonmc_blocks_per_group = sbi->s_blocks_per_group; + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); if (EXT3_INODE_SIZE(sb) == 0) @@ -1695,6 +1709,18 @@ static int ext3_fill_super (struct super sbi->s_rsv_window_head.rsv_goal_size = 0; ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);

@@ -1720,16 +1746,16 @@ static int ext3_fill_super (struct super if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount3; + goto failed_mount4; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) - goto failed_mount3; + goto failed_mount4; } else { if (!silent) printk (KERN_ERR "ext3: No journal on filesystem on %s\n", sb->s_id); - goto failed_mount3; + goto failed_mount4; }

  /* We have now updated the journal if required, so we can

@@ -1752,7 +1778,7 @@ static int ext3_fill_super (struct super (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { printk(KERN_ERR "EXT3-fs: Journal does not support " "requested data journaling mode\n"); - goto failed_mount4; + goto failed_mount5; } default: break; @@ -1775,13 +1801,13 @@ static int ext3_fill_super (struct super if (!sb->s_root) { printk(KERN_ERR "EXT3-fs: get root inode failed\n"); iput(root); - goto failed_mount4; + goto failed_mount5; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { dput(sb->s_root); sb->s_root = NULL; printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); - goto failed_mount4; + goto failed_mount5; }

  ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);

@@ -1813,8 +1839,10 @@ cantfind_ext3: sb->s_id); goto failed_mount;

-failed_mount4:
+failed_mount5:
 	journal_destroy(sbi->s_journal);
+failed_mount4:
+	kfree(sbi->s_bginfo);
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
diff -rupdN linux-2.6.23.13-clean/include/linux/ext3_fs.h linux-2.6.23.13-ext3mc/include/linux/ext3_fs.h
--- linux-2.6.23.13-clean/include/linux/ext3_fs.h	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/include/linux/ext3_fs.h	2008-01-12 22:30:19.000000000 -0500
@@ -384,6 +384,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_METACLUSTER		0x400000 /* Indirect block clustering */

/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -497,6 +498,7 @@ struct ext3_super_block { #ifdef KERNEL #include <linux/ext3_fs_i.h> #include <linux/ext3_fs_sb.h> +#include <linux/buffer_head.h> static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) { return sb->s_fs_info; @@ -732,6 +734,11 @@ struct dir_private_info { __u32 next_hash; };

+/* Special bh flag used by the metacluster readahead logic. / +enum ext3_bh_state_bits { + EXT3_BH_PREFETCH = BH_JBD_Sentinel, +}; + / calculate the first block number of the group */ static inline ext3_fsblk_t ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) @@ -740,6 +747,24 @@ ext3_group_first_block_no(struct super_b le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); }

+static inline void +ext3_set_buffer_prefetch(struct buffer_head *bh) +{ + set_bit(EXT3_BH_PREFETCH, &bh->b_state); +} + +static inline void +ext3_clear_buffer_prefetch(struct buffer_head *bh) +{ + clear_bit(EXT3_BH_PREFETCH, &bh->b_state); +} + +static inline int +ext3_buffer_prefetch(struct buffer_head bh) +{ + return test_bit(EXT3_BH_PREFETCH, &bh->b_state); +} + /

-extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[], int *errp);
 extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t block, unsigned long count);
 extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
diff -rupdN linux-2.6.23.13-clean/include/linux/ext3_fs_sb.h linux-2.6.23.13-ext3mc/include/linux/ext3_fs_sb.h
--- linux-2.6.23.13-clean/include/linux/ext3_fs_sb.h	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/include/linux/ext3_fs_sb.h	2008-01-12 22:30:19.000000000 -0500
@@ -24,6 +24,8 @@
 #endif
 #include <linux/rbtree.h>

+struct ext3_bg_info; + /*

@@ -66,6 +69,9 @@ struct ext3_sb_info { struct rb_root s_rsv_window_root; struct ext3_reserve_window_node s_rsv_window_head;

@@ -82,4 +88,11 @@ struct ext3_sb_info { #endif };

+/*
