From a379cd1d6bb00f9f5d2759d4a5621a884df5914e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Update i_disksize properly when allocating from fallocate area. When allocating unitialized space at the end of file which had been preallocated with the FALLOC_FL_KEEP_SIZE option, the file size is not updated at that time. But the later we are not updating the file size when writing to that preallocated space. These changes are for code correctness. This patch allows us to update the i_disksize at the write_end() callback of filesystem properly. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 47929c4e3dae..f7a746b5b7be 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2716,13 +2716,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: + if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = inode->i_size; + __set_bit(BH_New, &bh_result->b_state); /* Cache only when it is _not_ an uninitialized extent */ -- cgit v1.2.3 From 787e0981fad97a5ca3d07c7afe115a7e345b2861 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: return error when calling ext4_ext_split failed ext4_ext_create_new_leaf must return error when its calling to ext4_ext_split failed. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f7a746b5b7be..bc17ed742845 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -981,6 +981,8 @@ repeat: /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, path, newext, i); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); -- cgit v1.2.3 From 1973adcba570c226de840299056e055a3614185e Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Make ext4_ext_find_extent fills ext_path completely When pos=0 or depth, the fields of ext4_ext_path is are not completely filled. This patch also removes some unnecessary code. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc17ed742845..3e966daaadee 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -524,6 +524,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, alloc = 1; } path[0].p_hdr = eh; + path[0].p_bh = NULL; i = depth; /* walk through the tree */ @@ -552,12 +553,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_depth = i; - path[ppos].p_hdr = eh; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); -- cgit v1.2.3 From 9102e4fa8016af8bf1a263df913ee8fdafd4dfb0 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix ext4_ext_journal_restart() to reflect errors up to the caller Fix ext4_ext_journal_restart() so it returns any errors reported by ext4_journal_extend() and ext4_journal_restart(). Signed-off-by: Shen Feng Reviewed-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e966daaadee..17ea8bb12aa5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; if (handle->h_buffer_credits > needed) - return handle; - if (!ext4_journal_extend(handle, needed)) - return handle; - err = ext4_journal_restart(handle, needed); - - return handle; + return 0; + err = ext4_journal_extend(handle, needed); + if (err) + return err; + return ext4_journal_restart(handle, needed); } /* @@ -1888,11 +1887,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_journal_restart(handle, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) -- cgit v1.2.3 From 7061eba75ceb0835ba61e7cbd757a6f9c1e4af92 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Use inode preallocation with -o noextents When mballoc is enabled, block allocation for old block-based files are allocated using mballoc allocator instead of old block-based allocator. The old ext3 block reservation is turned off when mballoc is turned on. However, the in-core preallocation is not enabled for block-based/ non-extent based file block allocation. This result in performance regression, as now we don't have "reservation" ore in-core preallocation to prevent interleaved fragmentation in multiple writes workload. This patch fix this by enable per inode in-core preallocation for non extent files when mballoc is used. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 17ea8bb12aa5..c729b3507b46 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -187,7 +187,7 @@ ext4_ext_new_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_block(handle, inode, goal, err); + newblock = ext4_new_meta_block(handle, inode, goal, err); return newblock; } -- cgit v1.2.3 From 654b4908bc17a6318d18f3036fecc5155de92f55 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: cleanup block allocator Move the code related to block allocation to a single function and add helper funtions to differient allocation for data and meta data blocks Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c729b3507b46..555155d239df 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -179,8 +179,11 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, return bg_start + colour + block; } +/* + * Allocation for a meta data block + */ static ext4_fsblk_t -ext4_ext_new_block(handle_t *handle, struct inode *inode, +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err) { @@ -690,7 +693,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* allocate all needed blocks */ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -886,7 +890,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); if (newblock == 0) return err; -- cgit v1.2.3 From 953e622b601f58b7cc0f29fe644457fa40a18456 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: use atomic functions to set bh_state Use the BUFFER_FNS functions (set_buffer_foo) to set buffer head state atomically instead of nonatomic __set_bit(). Signed-off-by: Eric Sandeen Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 555155d239df..bb36a28f8ff3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2622,8 +2622,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ if (allocated > max_blocks) allocated = max_blocks; - /* mark the buffer unwritten */ - __set_bit(BH_Unwritten, &bh_result->b_state); + set_buffer_unwritten(bh_result); goto out2; } @@ -2729,7 +2728,7 @@ outnew: if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = inode->i_size; - __set_bit(BH_New, &bh_result->b_state); + set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ if (create != EXT4_CREATE_UNINITIALIZED_EXT) @@ -2739,7 +2738,7 @@ out: if (allocated > max_blocks) allocated = max_blocks; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); + set_buffer_mapped(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; out2: -- cgit v1.2.3 From cf108bca465dde0c015f32dd453b99457d31c7c7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Invert the locking order of page_lock and transaction start This changes are needed to support data=ordered mode handling via inodes. This enables us to get rid of the journal heads and buffer heads for data buffers in the ordered mode. With the changes, during tranasaction commit we writeout the inode pages using the writepages()/writepage(). That implies we take page lock during transaction commit. This can cause a deadlock with the locking order page_lock -> jbd2_journal_start, since the jbd2_journal_start can wait for the journal_commit to happen and the journal_commit now needs to take the page lock. To avoid this dead lock reverse the locking order. Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bb36a28f8ff3..b722bce7d662 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2749,7 +2749,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2762,18 +2762,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); -- cgit v1.2.3 From 9ddfc3dc75b5cc55ff3cfa586e962d252f1db9d3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix lock inversion in ext4_ext_truncate() We cannot call ext4_orphan_add() from under i_data_sem because that causes a lock ordering violation between i_data_sem and and the superblock lock. Updated with Aneesh's locking order fix Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b722bce7d662..7844bbb2bac0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2768,6 +2768,9 @@ void ext4_ext_truncate(struct inode *inode) if (inode->i_size & (sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); + if (ext4_orphan_add(handle, inode)) + goto out_stop; + down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); @@ -2778,8 +2781,6 @@ void ext4_ext_truncate(struct inode *inode) * Probably we need not scan at all, * because page truncation is enough. */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; @@ -2796,6 +2797,7 @@ void ext4_ext_truncate(struct inode *inode) handle->h_sync = 1; out_stop: + up_write(&EXT4_I(inode)->i_data_sem); /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. @@ -2806,7 +2808,6 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - up_write(&EXT4_I(inode)->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); -- cgit v1.2.3 From d2a1763791a634e315ec926b62829c1e88842c86 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 14 Jul 2008 17:52:37 -0400 Subject: ext4: delayed allocation ENOSPC handling This patch does block reservation for delayed allocation, to avoid ENOSPC later at page flush time. Blocks(data and metadata) are reserved at da_write_begin() time, the freeblocks counter is updated by then, and the number of reserved blocks is store in per inode counter. At the writepage time, the unused reserved meta blocks are returned back. At unlink/truncate time, reserved blocks are properly released. Updated fix from Aneesh Kumar K.V to fix the oldallocator block reservation accounting with delalloc, added lock to guard the counters and also fix the reservation for meta blocks. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7844bbb2bac0..dabc3b68d249 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -248,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode); + lcap = ext4_ext_space_block(inode); + icap = ext4_ext_space_block_idx(inode); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num += idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -2910,7 +2940,7 @@ retry: } ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); -- cgit v1.2.3 From 61628a3f3a37af2bf25daf8e26fd6b76a78c4f76 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Invert lock ordering of page_lock and transaction start in delalloc With the reverse locking, we need to start a transation before taking the page lock, so in ext4_da_writepages() we need to break the write-out into chunks, and restart the journal for each chunck to ensure the write-out fits in a single transaction. Updated patch from Aneesh Kumar K.V which fixes delalloc sync hang with journal lock inversion, and address the performance regression issue. Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dabc3b68d249..42c4c0c892ed 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2565,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; + loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2755,8 +2756,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } set_buffer_new(bh_result); -- cgit v1.2.3