From a71db86e86e0a1b28ffebe6b161f4a628911ebaf Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 20 Jun 2014 21:51:43 +0200 Subject: fs/btrfs/tree-log.c: Fix closing brace followed by if Signed-off-by: Rasmus Villemoes Reviewed-by: Josef Bacik Signed-off-by: Jiri Kosina --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9e1f2cd5e67a..50af2b96df6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3980,7 +3980,8 @@ again: if (ret < 0) { err = ret; goto out_unlock; - } if (ret) { + } + if (ret) { ins_nr = 0; btrfs_release_path(path); continue; -- cgit v1.2.3 From 962a298f35110edd8f326814ae41a3dd306ecb64 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Jun 2014 18:41:45 +0200 Subject: btrfs: kill the key type accessor helpers btrfs_set_key_type and btrfs_key_type are used inconsistently along with open coded variants. Other members of btrfs_key are accessed directly without any helpers anyway. 
Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d296efe2d3e7..2f5000c0a87a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1498,7 +1498,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, return -EIO; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -3364,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, @@ -4369,7 +4369,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; key.offset = (u64)-1; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); -- cgit v1.2.3 From 707e8a071528385a87b63a72a37c2322e463c7b8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Jun 2014 19:22:26 +0200 Subject: btrfs: use nodesize everywhere, kill leafsize The nodesize and leafsize were never of different values. Unify the usage and make nodesize the one. Cleanup the redundant checks and helpers. 
Shaves a few bytes from .text: text data bss dec hex filename 852418 24560 23112 900090 dbbfa btrfs.ko.before 851074 24584 23112 898770 db6d2 btrfs.ko.after Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2f5000c0a87a..7b6d1428f033 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2157,7 +2157,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); + blocksize = root->nodesize; parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); -- cgit v1.2.3 From f98de9b9c07485f7e21edfd5b2b20c89d662af3c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Aug 2014 19:37:21 +0100 Subject: Btrfs: make btrfs_search_forward return with nodes unlocked None of the uses of btrfs_search_forward() need to have the path nodes (level >= 1) read locked, only the leaf needs to be locked while the caller processes it. Therefore make it return a path with all nodes unlocked, except for the leaf. This change is motivated by the observation that during a file fsync we repeatedly call btrfs_search_forward() and process the returned leaf while upper nodes of the returned path (level >= 1) are read locked, which unnecessarily blocks other tasks that want to write to the same fs/subvol btree. Therefore instead of modifying the fsync code to unlock all nodes with level >= 1 immediately after calling btrfs_search_forward(), change btrfs_search_forward() to do it, so that it benefits all callers.
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7b6d1428f033..82db14f5cf87 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2983,8 +2983,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, min_key.type = key_type; min_key.offset = min_offset; - path->keep_locks = 1; - ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* @@ -3964,7 +3962,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, err = ret; goto out_unlock; } - path->keep_locks = 1; while (1) { ins_nr = 0; -- cgit v1.2.3 From a2cc11db245b9d8fbd4e3adbe2a1e7cf60473950 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 8 Sep 2014 22:53:18 +0100 Subject: Btrfs: fix directory recovery from fsync log When replaying a directory from the fsync log, if a directory entry exists both in the fs/subvol tree and in the log, the directory's inode got its i_size updated incorrectly, accounting for the dentry's name twice. Reproducer, from a test for xfstests: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey touch $SCRATCH_MNT/foo sync touch $SCRATCH_MNT/bar xfs_io -c "fsync" $SCRATCH_MNT xfs_io -c "fsync" $SCRATCH_MNT/bar _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey [ -f $SCRATCH_MNT/foo ] || echo "file foo is missing" [ -f $SCRATCH_MNT/bar ] || echo "file bar is missing" _unmount_flakey _check_scratch_fs $FLAKEY_DEV The filesystem check at the end failed with the message: "root 5 root dir 256 error". A test case for xfstests follows. 
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 82db14f5cf87..dce33b5a6942 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1637,6 +1637,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, found_key.type == log_key.type && found_key.offset == log_key.offset && btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + update_size = false; goto out; } -- cgit v1.2.3 From 8407f553268a4611f2542ed90677f0edfaa2c9c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 5 Sep 2014 15:14:39 +0100 Subject: Btrfs: fix data corruption after fast fsync and writeback error When we do a fast fsync, we start all ordered operations and then while they're running in parallel we visit the list of modified extent maps and construct their matching file extent items and write them to the log btree. After that, in btrfs_sync_log() we wait for all the ordered operations to finish (via btrfs_wait_logged_extents). The problem with this is that we were completely ignoring errors that can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM or -EIO errors for example. When such error happens, it means we have parts of the on disk extent that weren't written to, and so we end up logging file extent items that point to these extents that contain garbage/random data - so after a crash/reboot plus log replay, we get our inode's metadata pointing to those extents. This worked in contrast with the full (non-fast) fsync path, where we start all ordered operations, wait for them to finish and then write to the log btree. In this path, after each ordered operation completes we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return -EIO if so (via btrfs_wait_ordered_range). 
So if an error happens with any ordered operation, just return a -EIO error to userspace, so that it knows that not all of its previous writes were durably persisted and the application can take proper action (like redo the writes for e.g.) - and definitely not leave any file extent items in the log refer to non fully written extents. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 247 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 145 insertions(+), 102 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dce33b5a6942..2b26dad35d88 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only, const loff_t start, - const loff_t end); + const loff_t end, + struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3572,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int log_one_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct list_head *logged_list) +static int wait_ordered_extents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + const struct extent_map *em, + const struct list_head *logged_list, + bool *ordered_io_error) { - struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; struct btrfs_ordered_extent *ordered; - struct list_head ordered_sums; - struct btrfs_map_token token; - struct btrfs_key key; + struct btrfs_root *log = root->log_root; u64 mod_start = em->mod_start; u64 mod_len = em->mod_len; + const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 
u64 csum_offset; u64 csum_len; - u64 extent_offset = em->start - em->orig_start; - u64 block_len; - int ret; - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - int extent_inserted = 0; - - INIT_LIST_HEAD(&ordered_sums); - btrfs_init_map_token(&token); - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0, 1, - sizeof(*fi), &extent_inserted); - if (ret) - return ret; - - if (!extent_inserted) { - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - - ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); - if (ret) - return ret; - } - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, - &token); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - skip_csum = true; - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC, - &token); - } else { - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG, - &token); - if (em->block_start == EXTENT_MAP_HOLE) - skip_csum = true; - } - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start, - &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); - } else { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, - &token); - } - - btrfs_set_token_file_extent_offset(leaf, fi, - em->start - em->orig_start, - &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - 
btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); - btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, - &token); - btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); - btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); - btrfs_mark_buffer_dirty(leaf); + LIST_HEAD(ordered_sums); + int ret = 0; - btrfs_release_path(path); - if (ret) { - return ret; - } + *ordered_io_error = false; - if (skip_csum) + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + em->block_start == EXTENT_MAP_HOLE) return 0; /* - * First check and see if our csums are on our outstanding ordered - * extents. + * Wait far any ordered extent that covers our extent map. If it + * finishes without an error, first check and see if our csums are on + * our outstanding ordered extents. */ list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; @@ -3684,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans, mod_start + mod_len <= ordered->file_offset) continue; + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + const u64 start = ordered->file_offset; + const u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(ordered->inode != inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } + + wait_event(ordered->wait, + (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || + test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + *ordered_io_error = true; + break; + } /* * We are going to copy all the csums on this ordered extent, so * go ahead and adjust mod_start and mod_len in case this @@ -3715,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans, } } + if (skip_csum) + continue; + /* * To keep us from looping for the above case of an ordered * extent that falls inside of the logged 
extent. @@ -3732,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans, list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) - goto unlocked; + break; } - } -unlocked: - if (!mod_len || ret) + if (*ordered_io_error || !mod_len || ret || skip_csum) return ret; if (em->compress_type) { csum_offset = 0; - csum_len = block_len; + csum_len = max(em->block_len, em->orig_block_len); } else { csum_offset = mod_start - em->start; csum_len = mod_len; @@ -3770,11 +3716,106 @@ unlocked: return ret; } +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + const struct extent_map *em, + struct btrfs_path *path, + const struct list_head *logged_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_map_token token; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + int extent_inserted = 0; + bool ordered_io_err = false; + + ret = wait_ordered_extents(trans, inode, root, em, logged_list, + &ordered_io_err); + if (ret) + return ret; + + if (ordered_io_err) { + ctx->io_err = -EIO; + return 0; + } + + btrfs_init_map_token(&token); + + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); + if (ret) + return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_token_file_extent_type(leaf, fi, + 
BTRFS_FILE_EXTENT_PREALLOC, + &token); + else + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct list_head *logged_list) + struct list_head *logged_list, + struct btrfs_log_ctx *ctx) { struct extent_map *em, *n; struct list_head extents; @@ -3832,7 +3873,8 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, logged_list); + ret = log_one_extent(trans, inode, root, em, path, logged_list, + ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3862,7 +3904,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only, const 
loff_t start, - const loff_t end) + const loff_t end, + struct btrfs_log_ctx *ctx) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -4046,7 +4089,7 @@ log_extents: btrfs_release_path(dst_path); if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list); + &logged_list, ctx); if (ret) { err = ret; goto out_unlock; @@ -4246,7 +4289,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); if (ret) goto end_trans; @@ -4275,7 +4318,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only, - 0, LLONG_MAX); + 0, LLONG_MAX, ctx); if (ret) goto end_trans; } -- cgit v1.2.3 From 1a4ed8fdca077d2489ec47d548451be69389e926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 27 Oct 2014 10:44:24 +0000 Subject: Btrfs: fix invalid leaf slot access in btrfs_lookup_extent() If we couldn't find our extent item, we accessed the current slot (path->slots[0]) to check if it corresponds to an equivalent skinny metadata item. However this slot could be beyond our last item in the leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case we shouldn't process it. Since btrfs_lookup_extent() is only used to find extent items for data extents, fix this by removing completely the logic that looks up for an equivalent skinny metadata item, since it can not exist. 
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2b26dad35d88..6d58d72705ae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * is this extent already allocated in the extent * allocation tree? If so, just add a reference */ - ret = btrfs_lookup_extent(root, ins.objectid, + ret = btrfs_lookup_data_extent(root, ins.objectid, ins.offset); if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, -- cgit v1.2.3 From 9dba8cf128ef98257ca719722280c9634e7e9dc7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Nov 2014 10:19:54 -0500 Subject: Btrfs: make sure we wait on logged extents when fsyncing two subvols If we have two fsync() calls race on different subvols one will do all of its work to get into the log_tree, wait on its outstanding IO, and then allow the log_tree to finish its commit. The problem is we were just freeing that subvol's logged extents instead of waiting on them, so whoever lost the race wouldn't really have their data on disk. Fix this by waiting properly instead of freeing the logged extents.
Thanks, cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 286213cec861..fc715ff31d26 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2600,9 +2600,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_logged_extents(log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; -- cgit v1.2.3 From 50d9aa99bd35c77200e0e3dd7a72274f8304701f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Nov 2014 14:52:38 -0500 Subject: Btrfs: make sure logged extents complete in the current transaction V3 Liu Bo pointed out that my previous fix would lose the generation update in the scenario I described. It is actually much worse than that: we could lose the entire extent if we lose power right after the transaction commits. Consider the following sequence: (1) write extent 0-4k; (2) log extent in log tree; (3) commit transaction; (4) <-- power fail happens here; (5) ordered extent completes. We would lose the 0-4k extent because it hasn't updated the actual fs tree, and the transaction commit will reset the log so it isn't replayed. If we lose power before the transaction commit we are safe, otherwise we are not. Fix this by keeping track of all extents we logged in this transaction. Then when we go to commit the transaction make sure we wait for all of those ordered extents to complete before proceeding. This will make sure that if we lose power after the transaction commit we still have our data. This also fixes the problem of the improperly updated extent generation.
Thanks, cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc715ff31d26..7d96cc961663 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2600,7 +2600,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_wait_logged_extents(log, log_transid); + btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2645,7 +2645,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages, EXTENT_NEW | EXTENT_DIRTY); - btrfs_wait_logged_extents(log, log_transid); + btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -3766,7 +3766,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) btrfs_set_token_file_extent_type(leaf, fi, -- cgit v1.2.3 From 5ab5e44a36164f0366a98b47289c868d8fbcb256 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2014 16:59:53 +0000 Subject: Btrfs: don't ignore log btree writeback errors If an error happens during writeback of log btree extents, make sure the error is returned to the caller (fsync), so that it takes proper action (commit current transaction) instead of writing a superblock that points to log btrees with all or some nodes that weren't 
durably persisted. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d96cc961663..afe483cb27b0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, + mark); btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = root_log_ctx.log_ret; + if (!ret) + ret = root_log_ctx.log_ret; goto out; } ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); @@ -2641,10 +2643,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_NEW | EXTENT_DIRTY); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + if (!ret) + ret = btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, -- cgit v1.2.3 From 0870295b2371673b3563735825ad559409d8cedc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2014 17:00:35 +0000 Subject: Btrfs: collect only the necessary ordered extents 
on ranged fsync Instead of collecting all ordered extents from the inode's ordered tree and then waiting for all of them to complete, just collect the ones that overlap the fsync range. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index afe483cb27b0..3883d0febd82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3972,7 +3972,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list); + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate -- cgit v1.2.3 From b38ef71cb102208dffcf4e8524e9d5ec4ec0eaa9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2014 17:01:45 +0000 Subject: Btrfs: ensure ordered extent errors aren't missed on fsync When doing a fsync with a fast path we have a time window where we can miss the fact that writeback of some file data failed, and therefore we end up returning success (0) from fsync when we should return an error. The steps that lead to this are the following: 1) We start all ordered extents by calling filemap_fdatawrite_range(); 2) We do some other work like locking the inode's i_mutex, start a transaction, start a log transaction, etc; 3) We enter btrfs_log_inode(), acquire the inode's log_mutex and collect all the ordered extents from inode's ordered tree into a list; 4) But by the time we do ordered extent collection, some ordered extents we started at step 1) might have already completed with an error, and therefore we didn't find them in the ordered tree and had no idea they finished with an error.
This makes our fsync return success (0) to userspace, but has no bad effects on the log, such as the insertion of file extent items into the log that point to unwritten extents, because the invalid extent maps were removed before the ordered extent completed (in inode.c:btrfs_finish_ordered_io). So after collecting the ordered extents just check if the inode's i_mapping has any error flags set (AS_EIO or AS_ENOSPC) and leave with an error if it does. Whenever writeback fails for a page of an ordered extent, we call mapping_set_error (done in extent_io.c:end_extent_writepage, called by extent_io.c:end_bio_extent_writepage) that sets one of those error flags in the inode's i_mapping flags. This change also has the side effect of fixing the issue where for fast fsyncs we never checked/cleared the error flags from the inode's i_mapping flags, which means that a full fsync performed after a fast fsync could get such errors that belonged to the fast fsync - because the full fsync calls btrfs_wait_ordered_range() which calls filemap_fdatawait_range(), and the latter checks for and clears those flags, while for fast fsyncs we never call filemap_fdatawait_range() or anything else that checks for and clears the error flags from the inode's i_mapping. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3883d0febd82..9a02da16f2be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3635,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + /* + * Clear the AS_EIO/AS_ENOSPC flags from the inode's + * i_mapping flags, so that the next fsync won't get + * an outdated io error too.
+ */ + btrfs_inode_check_errors(inode); *ordered_io_error = true; break; } @@ -4098,6 +4104,21 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { + /* + * Some ordered extents started by fsync might have completed + * before we collected the ordered extents in logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. + */ + err = btrfs_inode_check_errors(inode); + if (err) { + ctx->io_err = err; + goto out_unlock; + } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx); if (ret) { -- cgit v1.2.3