Diffstat (limited to 'fs/btrfs'): 36 files changed, 531 insertions, 212 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 80e9c18ea64f..fd6b67c40d9d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -9,6 +9,8 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU + depends on !PPC_256K_PAGES # powerpc + depends on !PAGE_SIZE_256KB # hexagon help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a09264d8b853..d096254d9acc 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -261,6 +261,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -310,6 +317,13 @@ static void normal_work_helper(struct btrfs_work *work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq); } @@ -389,3 +403,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work) { set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ + if (wq->high) + flush_workqueue(wq->high->normal_wq); + + flush_workqueue(wq->normal->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 8e1d6576d764..7ea220726de2 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -81,4 +81,6 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq); + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 81c5d07a2af1..00c9a9e719ec 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -975,7 +975,12 @@ again: ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* This shouldn't happen, indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && @@ -1104,10 +1109,18 @@ again: goto out; if (!ret && extent_item_pos) { /* - * we've recorded that parent, so we must extend - * its inode list here + * We've recorded that parent, so we must extend + * its inode list here. + * + * However if there was corruption we may not + * have found an eie, return an error in this + * case. 
*/ - BUG_ON(!eie); + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } while (eie->next) eie = eie->next; eie->next = ref->inode_list; @@ -1221,6 +1234,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); + *roots = NULL; return ret; } node = ulist_next(tmp, &uiter); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bae05c5c75ba..92601775ec5e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -290,7 +290,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + !cb->errors); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a980b3309770..fbb4c81f6311 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -332,26 +332,6 @@ struct tree_mod_elem { struct tree_mod_root old_root; }; -static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) -{ - read_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) -{ - read_unlock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) -{ - write_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) -{ - write_unlock(&fs_info->tree_mod_log_lock); -} - /* * Pull a new tree mod seq number for our operation. */ @@ -371,14 +351,12 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - tree_mod_log_write_lock(fs_info); - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; } @@ -397,7 +375,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -408,29 +386,27 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - tree_mod_log_write_lock(fs_info); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = container_of(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); } /* @@ -441,7 +417,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * operations, or the shifted logical of the affected block for all other * operations. * - * Note: must be called with write lock (tree_mod_log_write_lock). + * Note: must be called with write lock for fs_info::tree_mod_log_lock. 
*/ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -481,7 +457,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it * returns zero with the tree_mod_log_lock acquired. The caller must hold * this until all tree mod log insertions are recorded in the rb tree and then - * call tree_mod_log_write_unlock() to release. + * write unlock fs_info::tree_mod_log_lock. */ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { @@ -491,9 +467,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, if (eb && btrfs_header_level(eb) == 0) return 1; - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); if (list_empty(&(fs_info)->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return 1; } @@ -557,7 +533,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, } ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); @@ -621,7 +597,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, ret = __tree_mod_log_insert(fs_info, tm); if (ret) goto free_tms; - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -632,7 +608,7 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); kfree(tm); @@ -713,7 +689,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (!ret) ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -741,7 +717,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *found = NULL; u64 index = start >> PAGE_CACHE_SHIFT; - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -769,7 +745,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); return found; } @@ -850,7 +826,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, goto free_tms; } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -862,7 +838,7 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return ret; @@ -922,7 +898,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) goto free_tms; ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -1153,6 +1129,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, root, ret); return ret; } @@ -1160,6 +1138,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = 
btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, root, ret); return ret; } @@ -1198,6 +1178,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(root->fs_info, buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, root, ret); return ret; } @@ -1284,7 +1266,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1339,7 +1321,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (tm->index != first_tm->index) break; } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); btrfs_set_header_nritems(eb, n); } @@ -1396,7 +1378,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); - extent_buffer_get(eb_rewin); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1448,7 +1431,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { + btrfs_tree_read_lock(old); eb = btrfs_clone_extent_buffer(old); + btrfs_tree_read_unlock(old); free_extent_buffer(old); } } else if (old_root) { @@ -1465,8 +1450,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); @@ -1474,6 +1457,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); if (tm) __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4a91d3119e59..8fb9a1e0048b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1576,14 +1576,12 @@ struct btrfs_fs_info { struct list_head delayed_iputs; struct mutex cleaner_delayed_iput_mutex; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t nr_async_submits; atomic_t async_submit_draining; @@ -4098,6 +4096,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); #ifdef CONFIG_PRINTK __printf(2, 3) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e06dd75ad13f..bb1e32f77b69 100644 --- 
a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -193,8 +193,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; } static bool merge_ref(struct btrfs_trans_handle *trans, @@ -281,7 +279,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -289,7 +287,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, list); @@ -317,7 +315,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -330,7 +328,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } @@ -444,7 +442,6 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, add_tail: list_add_tail(&ref->list, &href->ref_list); atomic_inc(&root->num_entries); - trans->delayed_ref_updates++; spin_unlock(&href->lock); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 78722aaffecd..de63cb9bc64b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1698,8 +1698,8 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio(bio); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); } static int cleaner_kthread(void *arg) @@ -1750,7 +1750,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(root->fs_info); sleep: - if (!try_to_freeze() && !again) { + if (!again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -2481,7 +2481,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); @@ -2973,6 +2972,7 @@ retry_root_backup: /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3774,6 +3774,19 @@ void close_ctree(struct btrfs_root *root) */ btrfs_delete_unused_bgs(root->fs_info); + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. 
+ * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case. + */ + btrfs_flush_workqueue(fs_info->delayed_workers); + ret = btrfs_commit_super(root); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 2513a7f53334..92f80ed64219 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -55,9 +55,9 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -150,7 +150,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } -static struct dentry *btrfs_get_parent(struct dentry *child) +struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = d_inode(child); struct btrfs_root *root = BTRFS_I(dir)->root; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 074348a95841..7a305e554999 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -16,4 +16,9 @@ struct btrfs_fid { u64 parent_root_objectid; } __attribute__ ((packed)); +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation); +struct dentry *btrfs_get_parent(struct dentry *child); + #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 34ffc125763f..3bb731b2156c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10688,7 +10688,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 315f21191643..42c745dbccfa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3995,6 +3995,10 @@ retry: if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } ret = write_one_eb(eb, fs_info, wbc, &epd); @@ -4153,6 +4157,14 @@ retry: */ scanned = 1; index = 0; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + flush_write_bio(data); goto retry; } btrfs_add_delayed_iput(inode); @@ -4418,6 +4430,8 @@ int try_release_extent_mapping(struct extent_map_tree *map, /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ } } return try_release_extent_state(map, tree, page, mask); @@ -4842,25 +4856,28 @@ err: static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; - /* the ref bit is tricky. We have to make sure it is set - * if we have the buffer dirty. Otherwise the - * code to free a buffer can end up dropping a dirty - * page + /* + * The TREE_REF bit is first set when the extent_buffer is added + * to the radix tree. It is also reset, if unset, when a new reference + * is created by find_extent_buffer. 
* - * Once the ref bit is set, it won't go away while the - * buffer is dirty or in writeback, and it also won't - * go away while we have the reference count on the - * eb bumped. + * It is only cleared in two cases: freeing the last non-tree + * reference to the extent_buffer when its STALE bit is set or + * calling releasepage when the tree reference is the only reference. * - * We can't just set the ref bit without bumping the - * ref on the eb because free_extent_buffer might - * see the ref bit and try to clear it. If this happens - * free_extent_buffer might end up dropping our original - * ref by mistake and freeing the page before we are able - * to add one more ref. + * In both cases, care is taken to ensure that the extent_buffer's + * pages are not under io. However, releasepage can be concurrently + * called with creating new references, which is prone to race + * conditions between the calls to check_buffer_tree_ref in those + * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * So bump the ref count first, then set the bit. If someone - * beat us to it, drop the ref we added. + * The actual lifetime of the extent_buffer in the radix tree is + * adequately protected by the refcount, but the TREE_REF bit and + * its corresponding reference are not. To protect against this + * class of races, we call check_buffer_tree_ref from the codepaths + * which trigger io after they set eb->io_pages. Note that once io is + * initiated, TREE_REF can no longer be cleared, so that is the + * moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -4937,12 +4954,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_CACHE_SHIFT, eb); @@ -5325,6 +5344,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); + /* + * It is possible for releasepage to clear the TREE_REF bit before we + * set io_pages. See check_buffer_tree_ref for a more detailed comment. 
+ */ + check_buffer_tree_ref(eb); for (i = start_i; i < num_pages; i++) { page = eb->pages[i]; if (!PageUptodate(page)) { @@ -5400,9 +5424,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dstv, - unsigned long start, unsigned long len) +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5423,7 +5447,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, cur = min(len, (PAGE_CACHE_SIZE - offset)); kaddr = page_address(page); - if (copy_to_user(dst, kaddr + offset, cur)) { + if (probe_user_write(dst, kaddr + offset, cur)) { ret = -EFAULT; break; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 751435967724..9631be7fc9e2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -313,9 +313,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dst, unsigned long start, - unsigned long len); +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dst, unsigned long start, + unsigned long len); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 84fb56d5c018..3818b65b0682 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -227,6 +227,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) struct extent_map *merge = NULL; struct rb_node *rb; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. 
+ */ + if (atomic_read(&em->refs) > 2) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 58ece6558430..fb5c97ea670f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -742,10 +742,12 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); - if (ret == 1) + if (ret < 0) { + goto out; + } else if (ret > 0) { found_next = 1; - if (ret != 0) goto insert; + } slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d056060529f8..2426dc56426f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1089,7 +1089,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_nr = 0; int del_slot = 0; int recow; - int ret; + int ret = 0; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); @@ -1284,7 +1284,7 @@ again: } out: btrfs_free_path(path); - return 0; + return ret; } /* @@ -1525,6 +1525,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + only_release_metadata = false; if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1659,7 +1660,6 @@ again: set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_NORESERVE, NULL, NULL, GFP_NOFS); - only_release_metadata = false; } btrfs_drop_pages(pages, num_pages); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6c0161284a9e..55d8020afc58 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -391,6 +391,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); + if (page->mapping != inode->i_mapping) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "free space cache page truncated"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } if (!PageUptodate(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); @@ -748,8 +754,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!e) + if (!e) { + ret = -ENOMEM; goto free_cache; + } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { @@ -758,6 +766,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } if (!e->bytes) { + ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -777,6 +786,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, num_bitmaps--; e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { + ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; @@ -2152,7 +2162,7 @@ out: static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *left_info; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *right_info; bool merged = false; u64 offset = info->offset; @@ -2167,7 +2177,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (right_info && rb_prev(&right_info->offset_index)) left_info = rb_entry(rb_prev(&right_info->offset_index), struct btrfs_free_space, offset_index); - else + else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); if 
(right_info && !right_info->bitmap) { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 07573dc1614a..3469c7ce7cb6 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -158,6 +158,7 @@ static void start_caching(struct btrfs_root *root) spin_lock(&root->ino_cache_lock); root->ino_cache_state = BTRFS_CACHE_FINISHED; spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); return; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2c3edb50702..92415b8ac5a3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -926,7 +926,7 @@ static noinline int cow_file_range(struct inode *inode, u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; - u64 disk_num_bytes; + u64 min_alloc_size; u64 cur_alloc_size; u64 blocksize = root->sectorsize; struct btrfs_key ins; @@ -942,7 +942,6 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; /* if this is a small write inside eof, kick off defrag */ if (num_bytes < 64 * 1024 && @@ -969,18 +968,33 @@ static noinline int cow_file_range(struct inode *inode, } } - BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(root->fs_info->super_copy)); + BUG_ON(num_bytes > btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); - while (disk_num_bytes > 0) { + /* + * Relocation relies on the relocated extents to have exactly the same + * size as the original extents. Normally writeback for relocation data + * extents follows a NOCOW path because relocation preallocates the + * extents. However, due to an operation such as scrub turning a block + * group to RO mode, it may fallback to COW mode, so we must make sure + * an extent allocated during COW has exactly the requested size and can + * not be split into smaller extents, otherwise relocation breaks and + * fails during the stage where it updates the bytenr of file extent + * items. + */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + min_alloc_size = num_bytes; + else + min_alloc_size = root->sectorsize; + + while (num_bytes > 0) { unsigned long op; - cur_alloc_size = disk_num_bytes; + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, - root->sectorsize, 0, alloc_hint, + min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -1033,7 +1047,7 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } - if (disk_num_bytes < cur_alloc_size) + if (num_bytes < cur_alloc_size) break; /* we're not doing compressed IO, don't unlock the first @@ -1050,8 +1064,10 @@ static noinline int cow_file_range(struct inode *inode, start + ram_size - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, op); - disk_num_bytes -= cur_alloc_size; - num_bytes -= cur_alloc_size; + if (num_bytes < cur_alloc_size) + num_bytes = 0; + else + num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } @@ -5354,11 +5370,13 @@ no_delete: } /* - * this returns the key found in the dir entry in the location pointer. + * Return the key found in the dir entry in the location pointer, fill @type + * with BTRFS_FT_*, and return 0. + * * If no dir entries were found, location->objectid is 0. 
*/ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, - struct btrfs_key *location) + struct btrfs_key *location, u8 *type) { const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -5380,6 +5398,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (!ret) + *type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); return ret; @@ -5506,7 +5526,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -5666,19 +5685,25 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location; + u8 di_type = 0; int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location); + ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); @@ -5687,6 +5712,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); + if (IS_ERR(inode)) + return inode; + + /* Do extra check against inode mode with di_type */ + if (btrfs_inode_type(inode) != di_type) { + btrfs_crit(root->fs_info, +"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", + inode->i_mode, btrfs_inode_type(inode), + di_type); + iput(inode); + return ERR_PTR(-EUCLEAN); + } return inode; } @@ -6300,11 +6337,6 @@ fail: return ERR_PTR(ret); } -static inline u8 btrfs_inode_type(struct inode *inode) -{ - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; -} - /* * utility function to add 'inode' into 'parent_inode' with * a give name and a given sequence number. 
@@ -6896,6 +6928,14 @@ again: extent_start = found_key.offset; if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { + /* Only regular file could have regular/prealloc extent */ + if (!S_ISREG(inode->i_mode)) { + err = -EUCLEAN; + btrfs_crit(root->fs_info, + "regular/prealloc extent found for non-regular inode %llu", + btrfs_ino(inode)); + goto out; + } extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { @@ -8302,7 +8342,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; - atomic_inc(&dip->pending_bios); while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { if (map_length < submit_len + bvec->bv_len || @@ -8359,7 +8398,8 @@ submit: if (!ret) return 0; - bio_put(bio); + if (bio != orig_bio) + bio_put(bio); out_err: dip->errors = 1; /* @@ -8406,7 +8446,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, io_bio->bi_private = dip; dip->orig_bio = io_bio; dip->dio_bio = dio_bio; - atomic_set(&dip->pending_bios, 0); + atomic_set(&dip->pending_bios, 1); btrfs_bio = btrfs_io_bio(io_bio); btrfs_bio->logical = file_offset; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3379490ce54d..f35e18e76f16 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,7 @@ #include "props.h" #include "sysfs.h" #include "qgroup.h" +#include "tree-log.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -594,12 +595,18 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(dir, dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, btrfs_ino(dir), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, @@ -2010,9 +2017,14 @@ static noinline int copy_to_sk(struct btrfs_root *root, sh.len = item_len; sh.transid = found_transid; - /* copy search result header */ - if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = -EFAULT; + /* + * Copy search result header. If we fault then loop again so we + * can fault in the pages and -EFAULT there if there's a + * problem. Otherwise we'll fault and then copy the buffer in + * properly this next time through + */ + if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = 0; goto out; } @@ -2020,10 +2032,14 @@ static noinline int copy_to_sk(struct btrfs_root *root, if (item_len) { char __user *up = ubuf + *sk_offset; - /* copy the item */ - if (read_extent_buffer_to_user(leaf, up, - item_off, item_len)) { - ret = -EFAULT; + /* + * Copy the item, same behavior as above, but reset the + * * sk_offset so we copy the full thing again. 
+ */ + if (read_extent_buffer_to_user_nofault(leaf, up, + item_off, item_len)) { + ret = 0; + *sk_offset -= sizeof(sh); goto out; } @@ -2113,6 +2129,11 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { + ret = fault_in_pages_writeable(ubuf + sk_offset, + *buf_size - sk_offset); + if (ret) + break; + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) @@ -2534,6 +2555,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; + if (!err) + btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_end_transaction(trans, root); if (ret && !err) err = ret; @@ -3833,6 +3856,8 @@ process_slot: ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8c27292ea9ea..2eadc8f8c9ef 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -820,10 +820,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) } btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't + * exit without waiting first for all other ordered extents in + * the range to complete. + */ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) ret = -EIO; btrfs_put_ordered_extent(ordered); - if (ret || end == 0 || end == start) + if (end == 0 || end == start) break; end--; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 734babb6626c..bc4cc417e7ab 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -462,6 +462,7 @@ next2: break; } out: + btrfs_free_path(path); fs_info->qgroup_flags |= flags; if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { fs_info->quota_enabled = 0; @@ -470,7 +471,6 @@ out: ret >= 0) { ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } - btrfs_free_path(path); if (ret < 0) { ulist_free(fs_info->qgroup_ulist); @@ -2288,8 +2288,10 @@ out: } btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); - if (done && !ret) + if (done && !ret) { ret = 1; + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + } return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d6ccfb31aef0..628b6a046093 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1289,7 +1289,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (!node) return -ENOMEM; - node->bytenr = root->node->start; + node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); @@ -1321,15 +1321,14 @@ static void __del_reloc_root(struct btrfs_root *root) if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); + ASSERT(!node || (struct btrfs_root *)node->data == root); } spin_lock(&root->fs_info->trans_lock); @@ -1342,7 +1341,7 @@ static void __del_reloc_root(struct btrfs_root *root) * helper to update the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +static int __update_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node 
*node = NULL; @@ -1350,7 +1349,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1362,7 +1361,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = new_bytenr; + node->bytenr = root->node->start; rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); @@ -1503,6 +1502,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } if (reloc_root->commit_root != reloc_root->node) { + __update_reloc_root(reloc_root); btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); @@ -1785,8 +1785,8 @@ int replace_path(struct btrfs_trans_handle *trans, int ret; int slot; - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1818,7 +1818,7 @@ again: parent = eb; while (1) { level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); + ASSERT(level >= lowest_level); ret = btrfs_bin_search(parent, &key, level, &slot); if (ret && slot > 0) @@ -2440,7 +2440,21 @@ out: free_reloc_roots(&reloc_roots); } - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + /* + * We used to have + * + * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + * + * here, but it's wrong. If we fail to start the transaction in + * prepare_to_merge() we will have only 0 ref reloc roots, none of which + * have actually been removed from the reloc_root_tree rb tree. This is + * fine because we're bailing here, and we hold a reference on the root + * for the list that holds it, so these roots will be cleaned up when we + * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root + * will be cleaned up on unmount. + * + * The remaining nodes will be cleaned up by free_reloc_control. 
+ */ } static void free_block_list(struct rb_root *blocks) @@ -4454,6 +4468,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } @@ -4563,11 +4578,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (buf == root->node) - __update_reloc_root(root, cow->start); - } - level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cc9ccc42f469..0b41a88ef9e9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -918,11 +918,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; - if (sctx->is_dev_replace && !is_metadata && !have_csum) { - sblocks_for_recheck = NULL; - goto nodatasum_case; - } - /* * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was @@ -1035,13 +1030,19 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } - if (!is_metadata && !have_csum) { + /* + * NOTE: Even for nodatasum case, it's still possible that it's a + * compressed data extent, thus scrub_fixup_nodatasum(), which write + * inode page cache onto disk, could cause serious data corruption. + * + * So here we could only read from disk, and hope our recovery could + * reach disk before the newer write. + */ + if (0 && !is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; WARN_ON(sctx->is_dev_replace); -nodatasum_case: - /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 40d1ab957fb6..de0ebb3b3cd3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -34,6 +34,7 @@ #include "disk-io.h" #include "btrfs_inode.h" #include "transaction.h" +#include "xattr.h" static int g_verbose = 0; @@ -4194,6 +4195,10 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, struct fs_path *p; posix_acl_xattr_header dummy_acl; + /* Capabilities are emitted by finish_inode_if_needed */ + if (!strncmp(name, XATTR_NAME_CAPS, name_len)) + return 0; + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -4733,6 +4738,64 @@ static int send_extent_data(struct send_ctx *sctx, return 0; } +/* + * Search for a capability xattr related to sctx->cur_ino. If the capability is + * found, call send_set_xattr function to emit it. + * + * Return 0 if there isn't a capability, or when the capability was emitted + * successfully, or < 0 if an error occurred. 
+ */ +static int send_capabilities(struct send_ctx *sctx) +{ + struct fs_path *fspath = NULL; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + unsigned long data_ptr; + char *buf = NULL; + int buf_len; + int ret = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, + XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); + if (!di) { + /* There is no xattr for this inode */ + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + buf_len = btrfs_dir_data_len(leaf, di); + + fspath = fs_path_alloc(); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!fspath || !buf) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); + read_extent_buffer(leaf, buf, data_ptr, buf_len); + + ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + strlen(XATTR_NAME_CAPS), buf, buf_len); +out: + kfree(buf); + fs_path_free(fspath); + btrfs_free_path(path); + return ret; +} + static int clone_range(struct send_ctx *sctx, struct clone_root *clone_root, const u64 disk_byte, @@ -5022,15 +5085,12 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } - right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); if (right_type == BTRFS_FILE_EXTENT_INLINE) { right_len = btrfs_file_extent_inline_len(eb, slot, ei); right_len = PAGE_ALIGN(right_len); } else { right_len = btrfs_file_extent_num_bytes(eb, ei); } - right_offset = btrfs_file_extent_offset(eb, ei); - right_gen = btrfs_file_extent_generation(eb, ei); /* * Are we at extent 8? If yes, we know the extent is changed. @@ -5055,6 +5115,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7. */ @@ -5443,6 +5507,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } + ret = send_capabilities(sctx); + if (ret < 0) + goto out; + /* * If other directory inodes depended on our current directory * inode's move/rename, now do their move/rename operations. 
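The search_ioctl()/copy_to_sk() change in the fs/btrfs/ioctl.c hunk above replaces copy_to_user() calls that could fault (and thus sleep) while an extent buffer is locked with a two-step scheme: pre-fault the user pages with no locks held, then copy inside the critical section with a routine that fails instead of faulting, retrying the whole step on failure. A minimal userspace sketch of that retry shape, where prefault() and try_copy_nofault() are hypothetical stand-ins for fault_in_pages_writeable() and probe_user_write(), and a pthread mutex stands in for the extent buffer lock:

/*
 * Userspace sketch of the retry pattern used by search_ioctl() after this
 * change: pre-fault the destination while unlocked, then copy inside the
 * critical section with a call that reports failure rather than sleeping
 * on a page fault. The names below are illustrative, not kernel APIs.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t leaf_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ eb lock */
static int page_resident;                  /* set by the pre-fault step */

static void prefault(void *dst, size_t len) /* ~ fault_in_pages_writeable() */
{
	(void)dst; (void)len;
	page_resident = 1;
}

static int try_copy_nofault(void *dst, const void *src, size_t len)
{
	if (!page_resident)
		return -1;         /* would have faulted; caller must retry */
	memcpy(dst, src, len);
	return 0;
}

int main(void)
{
	char result[16];
	const char item[16] = "search result";

	for (;;) {
		prefault(result, sizeof(result)); /* done with no locks held */

		pthread_mutex_lock(&leaf_lock);
		int ret = try_copy_nofault(result, item, sizeof(item));
		pthread_mutex_unlock(&leaf_lock);

		if (ret == 0)
			break;     /* copied without faulting */
		/* else: back off and pre-fault again, like ret = 0; goto out */
	}
	printf("%s\n", result);
	return 0;
}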
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0f99336c37eb..77e6ce0e1e35 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -843,8 +843,8 @@ out: return error; } -static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid) +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *fs_root; @@ -1120,6 +1120,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = info->tree_root; char *compress_type; + const char *subvol_name; if (btrfs_test_opt(root, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1204,8 +1205,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); - seq_puts(seq, ",subvol="); - seq_dentry(seq, dentry, " \t\n\\"); + subvol_name = btrfs_get_subvol_name_from_objectid(info, + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + if (!IS_ERR(subvol_name)) { + seq_puts(seq, ",subvol="); + seq_escape(seq, subvol_name, " \t\n\\"); + kfree(subvol_name); + } return 0; } @@ -1323,8 +1329,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, goto out; } } - subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), - subvol_objectid); + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); if (IS_ERR(subvol_name)) { root = ERR_CAST(subvol_name); subvol_name = NULL; @@ -1702,6 +1708,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); ret = -EINVAL; goto restore; } @@ -1978,6 +1986,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -2003,8 +2012,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2042,7 +2060,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. 
+ */ + if (!mixed && block_rsv->space_info->full && + total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9626252ee6b4..2825cbe3ea8d 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -48,7 +48,13 @@ static struct file_system_type test_type = { struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } int btrfs_init_test_fs(void) @@ -109,7 +115,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 054fc0d97131..5ff676df698f 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -235,6 +235,7 @@ static noinline int test_btrfs_get_extent(void) return ret; } + inode->i_mode = S_IFREG; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 2b2978c04e80..1efec40455f8 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -477,9 +477,9 @@ int btrfs_test_qgroups(void) * *cough*backref walking code*cough* */ root->node = alloc_test_extent_buffer(root->fs_info, 4096); - if (!root->node) { + if (IS_ERR(root->node)) { test_msg("Couldn't allocate dummy buffer\n"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 098016338f98..f0675b7c95ec 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1264,8 +1264,10 @@ int btrfs_defrag_root(struct btrfs_root *root) while (1) { trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } ret = btrfs_defrag_leaves(trans, root); @@ -1814,6 +1816,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); int ret; + /* + * Some places just start a transaction to commit it. We need to make + * sure that if this commit fails that the abort code actually marks the + * transaction as failed, so set trans->dirty to make the abort code do + * the right thing. 
+ */ + trans->dirty = true; + /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 45c77f261856..bcf61a32f970 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1511,6 +1511,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; if (ret == 1) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -1523,17 +1524,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + break; btrfs_release_path(path); inode = read_one_inode(root, key.offset); - if (!inode) - return -EIO; + if (!inode) { + ret = -EIO; + break; + } ret = fixup_inode_link_count(trans, root, inode); iput(inode); if (ret) - goto out; + break; /* * fixup on a directory may create new entries, @@ -1542,8 +1545,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, */ key.offset = (u64)-1; } - ret = 0; -out: btrfs_release_path(path); return ret; } @@ -1582,8 +1583,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; - } else { - BUG(); /* Logic Error */ } iput(inode); @@ -2208,7 +2207,9 @@ again: else { ret = find_dir_range(log, path, dirid, key_type, &range_start, &range_end); - if (ret != 0) + if (ret < 0) + goto out; + else if (ret > 0) break; } @@ -3169,11 +3170,13 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); - if (ret == -ENOSPC) { + if (err == -ENOSPC) { btrfs_set_log_full_commit(root->fs_info, trans); - ret = 0; - } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + err = 0; + } else if (err < 0 && err != -ENOENT) { + /* ENOENT can be returned if the entry hasn't been fsynced yet */ + btrfs_abort_transaction(trans, root, err); + } btrfs_end_log_trans(root); @@ -3333,6 +3336,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * search and this search we'll not find the key again and can just * bail. */ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3352,6 +3356,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { @@ -3733,11 +3744,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, log->fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); - if (ret) { - btrfs_release_path(dst_path); - kfree(ins_data); - return ret; - } + if (ret) + break; } } } @@ -3750,7 +3758,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. 
*/ - ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, @@ -4404,13 +4411,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(leaf, - path->slots[0], - extent); - ASSERT(len == i_size); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. */ @@ -5700,6 +5702,21 @@ record: } /* + * Make sure that if someone attempts to fsync the parent directory of a deleted + * snapshot, it ends up triggering a transaction commit. This is to guarantee + * that after replaying the log tree of the parent directory's root we will not + * see the snapshot anymore and at log replay time we will not see any log tree + * corresponding to the deleted snapshot's root, which could lead to replaying + * it after replaying the log tree of the parent directory (which would replay + * the snapshot delete operation). + */ +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct inode *dir) +{ + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. * diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 6916a781ea02..a9f1b75d080d 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, int for_rename); +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct inode *dir); int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *old_dir, struct dentry *parent); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 837a9a8d579e..24eb6283dc62 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -332,6 +332,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 55ce6543050d..d6383d362e27 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2357,9 +2357,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_set_super_num_devices(root->fs_info->super_copy, tmp + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); - /* * we've got more storage, clear any full flags on the space * infos @@ -2367,6 +2364,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_clear_space_info_full(root->fs_info); unlock_chunks(root); + + /* add sysfs device entry */ + btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { @@ -4065,6 +4066,7 @@ static int btrfs_uuid_scan_kthread(void *data) goto skip; } update_tree: + btrfs_release_path(path); if (!btrfs_is_empty_uuid(root_item.uuid)) { ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, root_item.uuid, @@ -4090,6 +4092,7 @@ update_tree: } skip: + btrfs_release_path(path); if (trans) { ret = btrfs_end_transaction(trans, fs_info->uuid_root); trans = NULL; @@ -4097,7 +4100,6 @@ skip: break; } - 
btrfs_release_path(path); if (key.offset < (u64)-1) { key.offset++; } else if (key.type < BTRFS_ROOT_ITEM_KEY) { @@ -6261,6 +6263,13 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root, return -EIO; } + if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { + btrfs_err(root->fs_info, + "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { btrfs_err(root->fs_info, "missing chunk type flag: 0x%llx", type); return -EIO; @@ -6694,6 +6703,14 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) lock_chunks(root); /* + * It is possible for mount and umount to race in such a way that + * we execute this code path, but open_fs_devices failed to clear + * total_rw_bytes. We certainly want it cleared before reading the + * device items, so clear it here. + */ + root->fs_info->fs_devices->total_rw_bytes = 0; + + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7feac2d9da56..d24f3ceb0691 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -312,7 +312,6 @@ struct btrfs_bio { u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; - unsigned long flags; void *private; atomic_t error; int max_errors; |
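The smp_mb__before_atomic()/smp_rmb() pair added in the fs/btrfs/async-thread.c hunk implements a publish/consume protocol: every store made by the ordinary work function must be visible to any thread that observes WORK_DONE_BIT set. A self-contained C11 analog of that ordering contract (an illustration, not kernel code; the release fence approximates the full barrier of smp_mb__before_atomic()):

/*
 * C11 analog of the WORK_DONE_BIT publication in async-thread.c. The
 * release fence before setting the flag pairs with the acquire fence
 * after reading it, so a consumer that sees done==1 also sees payload.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;             /* ordinary (non-atomic) work result */
static atomic_int done;         /* ~ WORK_DONE_BIT */

static void *worker(void *arg)
{
	(void)arg;
	payload = 42;                              /* work->func() writes */
	atomic_thread_fence(memory_order_release); /* ~ smp_mb__before_atomic() */
	atomic_store_explicit(&done, 1, memory_order_relaxed); /* set_bit() */
	return NULL;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, worker, NULL);

	while (!atomic_load_explicit(&done, memory_order_relaxed))
		;                                  /* ~ test_bit(WORK_DONE_BIT) */
	atomic_thread_fence(memory_order_acquire);  /* ~ smp_rmb() */

	printf("%d\n", payload);                    /* guaranteed to print 42 */
	pthread_join(t, NULL);
	return 0;
}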
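The btrfs_statfs() change in the fs/btrfs/super.c hunk narrows when f_bavail is forced to zero: only when the filesystem does not use mixed block groups (where metadata is accounted as data), the metadata space_info can no longer grow (->full), and the global reserve does not fit in the free metadata space minus the 4 MiB threshold. That decision reduces to a small predicate; the sketch below uses a hypothetical metadata_exhausted() helper and illustrative numbers, and, like the kernel code, assumes total_free_meta >= thresh:

/* Standalone sketch of the f_bavail gating added to btrfs_statfs(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool metadata_exhausted(bool mixed, bool space_info_full,
			       uint64_t total_free_meta, uint64_t thresh,
			       uint64_t global_rsv_size)
{
	/* Mixed block groups account metadata as data: never zero f_bavail. */
	if (mixed)
		return false;
	/* While chunks can still be allocated, free metadata can still grow. */
	if (!space_info_full)
		return false;
	return total_free_meta - thresh < global_rsv_size;
}

int main(void)
{
	const uint64_t thresh = 4ULL * 1024 * 1024; /* 4 MiB, as in the patch */

	/* 48 MiB free metadata, 64 MiB reserve, no room for new chunks. */
	printf("%d\n", metadata_exhausted(false, true,
					  48ULL << 20, thresh, 64ULL << 20)); /* 1 */

	/* Same numbers, but chunks can still be allocated: keep real bavail. */
	printf("%d\n", metadata_exhausted(false, false,
					  48ULL << 20, thresh, 64ULL << 20)); /* 0 */
	return 0;
}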