summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c14
-rw-r--r--fs/btrfs/btrfs_inode.h6
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c20
-rw-r--r--fs/btrfs/ctree.h17
-rw-r--r--fs/btrfs/dev-replace.c5
-rw-r--r--fs/btrfs/disk-io.c37
-rw-r--r--fs/btrfs/extent-tree.c433
-rw-r--r--fs/btrfs/extent_io.c10
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file-item.c2
-rw-r--r--fs/btrfs/file.c26
-rw-r--r--fs/btrfs/free-space-cache.c192
-rw-r--r--fs/btrfs/inode.c112
-rw-r--r--fs/btrfs/ioctl.c37
-rw-r--r--fs/btrfs/locking.c80
-rw-r--r--fs/btrfs/ordered-data.c134
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/print-tree.c9
-rw-r--r--fs/btrfs/qgroup.c169
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid56.c5
-rw-r--r--fs/btrfs/scrub.c19
-rw-r--r--fs/btrfs/super.c67
-rw-r--r--fs/btrfs/sysfs.c32
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/transaction.c45
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/btrfs/volumes.c68
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/zlib.c2
34 files changed, 1034 insertions, 542 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..54a201dac7f9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
if (ret > 0)
goto next;
- ret = ulist_add_merge(parents, eb->start,
- (uintptr_t)eie,
- (u64 *)&old, GFP_NOFS);
+ ret = ulist_add_merge_ptr(parents, eb->start,
+ eie, (void **)&old, GFP_NOFS);
if (ret < 0)
break;
if (!ret && extent_item_pos) {
@@ -1001,16 +1000,19 @@ again:
ret = -EIO;
goto out;
}
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie);
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
ref->inode_list = eie;
}
- ret = ulist_add_merge(refs, ref->parent,
- (uintptr_t)ref->inode_list,
- (u64 *)&eie, GFP_NOFS);
+ ret = ulist_add_merge_ptr(refs, ref->parent,
+ ref->inode_list,
+ (void **)&eie, GFP_NOFS);
if (ret < 0)
goto out;
if (!ret && extent_item_pos) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..43527fd78825 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -84,12 +84,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
- /*
- * list for tracking inodes that must be sent to disk before a
- * rename or truncate commit
- */
- struct list_head ordered_operations;
-
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 92371c414228..1daea0b47187 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
spin_lock(workspace_lock);
if (*num_workspace < num_online_cpus()) {
- list_add_tail(workspace, idle_workspace);
+ list_add(workspace, idle_workspace);
(*num_workspace)++;
spin_unlock(workspace_lock);
goto wake;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..44ee5d2e52a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 0);
if (ret)
return ret;
@@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+ ret = btrfs_inc_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+ ret = btrfs_dec_ref(trans, root, buf, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 0);
BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
@@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+ ret = btrfs_dec_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
}
clean_tree_block(trans, root, buf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7e2c1c1ef36..8e29b614fe93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1259,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
+ /*
+ * It is just used for the delayed data space allocation because
+ * only the data space allocation and the relative metadata update
+ * can be done cross the transaction.
+ */
+ struct rw_semaphore data_rwsem;
+
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
@@ -3316,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data);
+ struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota);
+ struct extent_buffer *buf, int full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota);
+ struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
@@ -3330,7 +3338,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset, int no_quota);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2af6e66fe788..eea26e1b2fda 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -36,6 +36,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
@@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8bb4aa19898f..d0ed9e664f7d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
- struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
@@ -369,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
- btrfs_tree_read_unlock_blocking(eb);
+ if (need_lock)
+ btrfs_tree_read_unlock_blocking(eb);
return ret;
}
@@ -2904,7 +2903,9 @@ retry_root_backup:
if (ret)
goto fail_qgroup;
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
printk(KERN_WARNING
"BTRFS: failed to recover relocation\n");
@@ -3826,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
}
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
- struct btrfs_root *root)
-{
- struct btrfs_inode *btrfs_inode;
- struct list_head splice;
-
- INIT_LIST_HEAD(&splice);
-
- mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_root_lock);
-
- list_splice_init(&t->ordered_operations, &splice);
- while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- ordered_operations);
-
- list_del_init(&btrfs_inode->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
-
- btrfs_invalidate_inodes(btrfs_inode->root);
-
- spin_lock(&root->fs_info->ordered_root_lock);
- }
-
- spin_unlock(&root->fs_info->ordered_root_lock);
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
-}
-
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
@@ -4090,8 +4063,6 @@ again:
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
- btrfs_destroy_ordered_operations(cur_trans, root);
-
btrfs_destroy_delayed_refs(cur_trans, root);
cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fafb3e53ecde..102ed3143976 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -105,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 num_bytes, int reserve,
+ int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -3056,7 +3057,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc, int no_quota)
+ int full_backref, int inc)
{
u64 bytenr;
u64 num_bytes;
@@ -3110,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, no_quota);
+ key.offset, 1);
if (ret)
goto fail;
} else {
@@ -3118,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
- no_quota);
+ 1);
if (ret)
goto fail;
}
@@ -3129,15 +3130,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota)
+ struct extent_buffer *buf, int full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota)
+ struct extent_buffer *buf, int full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3260,7 +3261,8 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root, SPACE_CACHE) ||
+ block_group->delalloc_bytes) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -5613,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
+ * @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space, or by somebody who is
* freeing space that was never actually used on disk. For example if you
@@ -5631,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* succeeds.
*/
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+ u64 num_bytes, int reserve, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -5650,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
}
} else {
if (cache->ro)
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -5669,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
- struct btrfs_space_info *space_info;
down_write(&fs_info->commit_root_sem);
@@ -5692,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->commit_root_sem);
- list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
- percpu_counter_set(&space_info->total_bytes_pinned, 0);
-
update_global_block_rsv(fs_info);
}
@@ -5732,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -6206,7 +6212,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6365,6 +6371,70 @@ enum btrfs_loop_type {
LOOP_NO_EMPTY_SIZE = 3,
};
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+{
+ struct btrfs_block_group_cache *used_bg;
+ bool locked = false;
+again:
+ spin_lock(&cluster->refill_lock);
+ if (locked) {
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+ down_read(&used_bg->data_rwsem);
+ locked = true;
+ goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -6379,7 +6449,7 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 flags)
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6467,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -6481,7 +6552,7 @@ search:
u64 offset;
int cached;
- btrfs_get_block_group(block_group);
+ btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
/*
@@ -6529,16 +6600,16 @@ have_block_group:
* the refill lock keeps out other
* people trying to start a new cluster
*/
- spin_lock(&last_ptr->refill_lock);
- used_block_group = last_ptr->block_group;
- if (used_block_group != block_group &&
- (!used_block_group ||
- used_block_group->ro ||
- !block_group_bits(used_block_group, flags)))
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
goto refill_cluster;
- if (used_block_group != block_group)
- btrfs_get_block_group(used_block_group);
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
offset = btrfs_alloc_from_cluster(used_block_group,
last_ptr,
@@ -6552,16 +6623,15 @@ have_block_group:
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group,
+ delalloc);
block_group = used_block_group;
}
goto checks;
}
WARN_ON(last_ptr->block_group != used_block_group);
- if (used_block_group != block_group)
- btrfs_put_block_group(used_block_group);
-refill_cluster:
+release_cluster:
/* If we are on LOOP_NO_EMPTY_SIZE, we can't
* set up a new clusters, so lets just skip it
* and let the allocator find whatever block
@@ -6578,8 +6648,10 @@ refill_cluster:
* succeeding in the unclustered
* allocation. */
if (loop >= LOOP_NO_EMPTY_SIZE &&
- last_ptr->block_group != block_group) {
+ used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(used_block_group,
+ delalloc);
goto unclustered_alloc;
}
@@ -6589,6 +6661,10 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
if (loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
@@ -6696,7 +6772,7 @@ checks:
BUG_ON(offset > search_start);
ret = btrfs_update_reserved_bytes(block_group, num_bytes,
- alloc_type);
+ alloc_type, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -6708,13 +6784,13 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);
@@ -6827,7 +6903,7 @@ again:
int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data)
+ struct btrfs_key *ins, int is_data, int delalloc)
{
bool final_tried = false;
u64 flags;
@@ -6837,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags);
+ flags, delalloc);
if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
@@ -6862,7 +6938,8 @@ again:
}
static int __btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len, int pin)
+ u64 start, u64 len,
+ int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -6881,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
pin_down_extent(root, cache, start, len, 1);
else {
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
btrfs_put_block_group(cache);
@@ -6891,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
}
int btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len)
+ u64 start, u64 len, int delalloc)
{
- return __btrfs_free_reserved_extent(root, start, len, 0);
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(root, start, len, 1);
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -7114,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT);
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -7256,7 +7333,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize,
- empty_size, hint, &ins, 0);
+ empty_size, hint, &ins, 0, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -7401,6 +7478,220 @@ reada:
wc->reada_slot = slot;
}
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ int nr = btrfs_header_nritems(eb);
+ int i, extent_type, ret;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 bytenr, num_bytes;
+
+ for (i = 0; i < nr; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ /* filter out non qgroup-accountable extents */
+ extent_type = btrfs_file_extent_type(eb, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+
+ bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (!bytenr)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ bytenr, num_bytes,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have it's slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+ struct btrfs_path *path, int root_level)
+{
+ int level = 0;
+ int nr, slot;
+ struct extent_buffer *eb;
+
+ if (root_level == 0)
+ return 1;
+
+ while (level <= root_level) {
+ eb = path->nodes[level];
+ nr = btrfs_header_nritems(eb);
+ path->slots[level]++;
+ slot = path->slots[level];
+ if (slot >= nr || level == 0) {
+ /*
+ * Don't free the root - we will detect this
+ * condition after our loop and return a
+ * positive value for caller to stop walking the tree.
+ */
+ if (level != root_level) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+
+ free_extent_buffer(eb);
+ path->nodes[level] = NULL;
+ path->slots[level] = 0;
+ }
+ } else {
+ /*
+ * We have a valid slot to walk back down
+ * from. Stop here so caller can process these
+ * new nodes.
+ */
+ break;
+ }
+
+ level++;
+ }
+
+ eb = path->nodes[root_level];
+ if (path->slots[root_level] >= btrfs_header_nritems(eb))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *root_eb,
+ u64 root_gen,
+ int root_level)
+{
+ int ret = 0;
+ int level;
+ struct extent_buffer *eb = root_eb;
+ struct btrfs_path *path = NULL;
+
+ BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+ BUG_ON(root_eb == NULL);
+
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
+ if (!extent_buffer_uptodate(root_eb)) {
+ ret = btrfs_read_buffer(root_eb, root_gen);
+ if (ret)
+ goto out;
+ }
+
+ if (root_level == 0) {
+ ret = account_leaf_items(trans, root, root_eb);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Walk down the tree. Missing extent blocks are filled in as
+ * we go. Metadata is accounted every time we read a new
+ * extent block.
+ *
+ * When we reach a leaf, we account for file extent items in it,
+ * walk back up the tree (adjusting slot pointers as we go)
+ * and restart the search process.
+ */
+ extent_buffer_get(root_eb); /* For path */
+ path->nodes[root_level] = root_eb;
+ path->slots[root_level] = 0;
+ path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+ level = root_level;
+ while (level >= 0) {
+ if (path->nodes[level] == NULL) {
+ int child_bsize = root->nodesize;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /* We need to get child blockptr/gen from
+ * parent before we can read it. */
+ eb = path->nodes[level + 1];
+ parent_slot = path->slots[level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+ eb = read_tree_block(root, child_bytenr, child_bsize,
+ child_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ child_bytenr,
+ child_bsize,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
+ 0);
+ if (ret)
+ goto out;
+
+ }
+
+ if (level == 0) {
+ ret = account_leaf_items(trans, root, path->nodes[level]);
+ if (ret)
+ goto out;
+
+ /* Nonzero return here means we completed our search */
+ ret = adjust_slots_upwards(root, path, root_level);
+ if (ret)
+ break;
+
+ /* Restart search with new slots */
+ goto walk_down;
+ }
+
+ level--;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
/*
* helper to process tree block while walking down the tree.
*
@@ -7455,9 +7746,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag,
@@ -7504,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
int level = wc->level;
int reada = 0;
int ret = 0;
+ bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -7549,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level - 1] > 1) {
+ need_account = true;
if (level == 1 &&
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
goto skip;
@@ -7612,6 +7905,16 @@ skip:
parent = 0;
}
+ if (need_account) {
+ ret = account_shared_subtree(trans, root, next,
+ generation, level - 1);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting shared subtree. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
+ }
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0, 0);
BUG_ON(ret); /* -ENOMEM */
@@ -7692,12 +7995,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1,
- wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 1);
else
- ret = btrfs_dec_ref(trans, root, eb, 0,
- wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
+ ret = account_leaf_items(trans, root, eb);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting leaf items. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
@@ -7823,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
+ btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -7948,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
+ /*
+ * Qgroup update accounting is run from
+ * delayed ref handling. This usually works
+ * out because delayed refs are normally the
+ * only way qgroup updates are added. However,
+ * we may have added updates during our tree
+ * walk so run qgroups here to make sure we
+ * don't lose any updates.
+ */
+ ret = btrfs_delayed_qgroup_accounting(trans,
+ root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8001,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
+ ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -8659,6 +8995,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
start);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a389820d158b..3e11aab9f391 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3437,16 +3437,10 @@ done_unlocked:
return 0;
}
-static int eb_wait(void *word)
-{
- io_schedule();
- return 0;
-}
-
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
- wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
- TASK_UNINTERRUPTIBLE);
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+ TASK_UNINTERRUPTIBLE);
}
static noinline_for_stack int
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 15ce5f2a2b62..ccc264e7bde1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -158,7 +158,6 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
- wait_queue_head_t lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1874aee69c86..225302b39afb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em)
if (atomic_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e7fd8a56a140..b2991fd8583e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..54c84daec9b5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -756,7 +756,7 @@ again:
found_next = 1;
if (ret != 0)
goto insert;
- slot = 0;
+ slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..d3afac292d67 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1838,33 +1838,9 @@ out:
int btrfs_release_file(struct inode *inode, struct file *filp)
{
- /*
- * ordered_data_close is set by settattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
- */
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
- &BTRFS_I(inode)->runtime_flags)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We need to block on a committing transaction to keep us from
- * throwing a ordered operation on to the list and causing
- * something like sync to deadlock trying to flush out this
- * inode.
- */
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
- btrfs_end_transaction(trans, root);
- if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
- filemap_flush(inode->i_mapping);
- }
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
+ filemap_flush(inode->i_mapping);
return 0;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 372b05ff1943..2b0a627cb5f9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -274,18 +274,32 @@ struct io_ctl {
};
static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
- struct btrfs_root *root)
+ struct btrfs_root *root, int write)
{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
memset(io_ctl, 0, sizeof(struct io_ctl));
- io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
- GFP_NOFS);
+
+ io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
io_ctl->root = root;
- if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
- io_ctl->check_crcs = 1;
+ io_ctl->check_crcs = check_crcs;
+
return 0;
}
@@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+ "The free space cache file (%llu) is invalid. skip it\n",
+ offset);
+ return 0;
+ }
+
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(root->fs_info,
"free space inode generation (%llu) "
@@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
if (ret)
return ret;
@@ -957,19 +978,18 @@ fail:
}
static noinline_for_stack int
-add_ioctl_entries(struct btrfs_root *root,
- struct inode *inode,
- struct btrfs_block_group_cache *block_group,
- struct io_ctl *io_ctl,
- struct extent_state **cached_state,
- struct list_head *bitmap_list,
- int *entries)
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct io_ctl *io_ctl,
+ int *entries)
{
u64 start, extent_start, extent_end, len;
- struct list_head *pos, *n;
struct extent_io_tree *unpin = NULL;
int ret;
+ if (!block_group)
+ return 0;
+
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
@@ -979,23 +999,19 @@ add_ioctl_entries(struct btrfs_root *root,
*/
unpin = root->fs_info->pinned_extents;
- if (block_group)
- start = block_group->key.objectid;
+ start = block_group->key.objectid;
- while (block_group && (start < block_group->key.objectid +
- block_group->key.offset)) {
+ while (start < block_group->key.objectid + block_group->key.offset) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
- if (ret) {
- ret = 0;
- break;
- }
+ if (ret)
+ return 0;
/* This pinned extent is out of our range */
if (extent_start >= block_group->key.objectid +
block_group->key.offset)
- break;
+ return 0;
extent_start = max(extent_start, start);
extent_end = min(block_group->key.objectid +
@@ -1005,11 +1021,20 @@ add_ioctl_entries(struct btrfs_root *root,
*entries += 1;
ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
start = extent_end;
}
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
/* Write out the bitmaps */
list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
@@ -1017,36 +1042,24 @@ add_ioctl_entries(struct btrfs_root *root,
ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
list_del_init(&entry->list);
}
- /* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(io_ctl);
-
- ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
- 0, i_size_read(inode), cached_state);
- io_ctl_drop_pages(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state, GFP_NOFS);
+ return 0;
+}
- if (ret)
- goto fail;
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
- if (ret) {
+ if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
GFP_NOFS);
- goto fail;
- }
- return 0;
-fail:
- return -1;
-
-out_nospc:
- return -ENOSPC;
+ return ret;
}
static void noinline_for_stack
@@ -1056,6 +1069,7 @@ cleanup_write_cache_enospc(struct inode *inode,
struct list_head *bitmap_list)
{
struct list_head *pos, *n;
+
list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
@@ -1088,64 +1102,104 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
{
struct extent_state *cached_state = NULL;
struct io_ctl io_ctl;
- struct list_head bitmap_list;
+ LIST_HEAD(bitmap_list);
int entries = 0;
int bitmaps = 0;
int ret;
- int err = -1;
-
- INIT_LIST_HEAD(&bitmap_list);
if (!i_size_read(inode))
return -1;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 1);
if (ret)
return -1;
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+ }
+
/* Lock all pages first so we can lock the extent safely. */
io_ctl_prepare_pages(&io_ctl, inode, 0);
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);
-
- /* Make sure we can fit our crcs into the first page */
- if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
- goto out_nospc;
-
io_ctl_set_generation(&io_ctl, trans->transid);
+ /* Write out the extent entries in the free space cache */
ret = write_cache_extent_entries(&io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
if (ret)
goto out_nospc;
- ret = add_ioctl_entries(root, inode, block_group, &io_ctl,
- &cached_state, &bitmap_list, &entries);
+ /*
+ * Some spaces that are freed in the current transaction are pinned,
+ * they will be added into free space cache after the transaction is
+ * committed, we shouldn't lose them.
+ */
+ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ if (ret)
+ goto out_nospc;
- if (ret == -ENOSPC)
+ /* At last, we write out all the bitmaps. */
+ ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ if (ret)
goto out_nospc;
- else if (ret)
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(&io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(&io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
goto out;
- err = update_cache_item(trans, root, inode, path, offset,
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
entries, bitmaps);
-
out:
io_ctl_free(&io_ctl);
- if (err) {
+ if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return err;
+ return ret;
out_nospc:
-
cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
goto out;
}
@@ -1165,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
+
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8925f66a1411..03708ef3deef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -693,7 +693,7 @@ retry:
ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint, &ins, 1);
+ 0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
@@ -709,6 +709,18 @@ retry:
unlock_extent(io_tree, async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
+
+ /*
+ * we need to redirty the pages if we decide to
+ * fallback to uncompressed IO, otherwise we
+ * will not submit these pages down to lower
+ * layers.
+ */
+ extent_range_redirty_for_io(inode,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
+
goto retry;
}
goto out_free;
@@ -794,7 +806,7 @@ retry:
out:
return ret;
out_free_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
@@ -917,7 +929,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- &ins, 1);
+ &ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -995,7 +1007,7 @@ out:
return ret;
out_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -2599,6 +2611,21 @@ out_kfree:
return NULL;
}
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -2698,6 +2725,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -2750,7 +2781,7 @@ out:
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(root, ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_len, 1);
}
@@ -6535,21 +6566,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
- alloc_hint, &ins, 1);
+ alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
free_extent_map(em);
return ERR_PTR(ret);
}
@@ -7437,7 +7468,7 @@ free_ordered:
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len);
+ ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -7920,27 +7951,6 @@ static int btrfs_truncate(struct inode *inode)
BUG_ON(ret);
/*
- * setattr is responsible for setting the ordered_data_close flag,
- * but that is only tested during the last file release. That
- * could happen well after the next commit, leaving a great big
- * window where new writes may get lost if someone chooses to write
- * to this file after truncating to zero
- *
- * The inode doesn't have any dirty data here, and so if we commit
- * this is a noop. If someone immediately starts writing to the inode
- * it is very likely we'll catch some of their writes in this
- * transaction, and the commit will find this file on the ordered
- * data list with good things to send down.
- *
- * This is a best effort solution, there is still a window where
- * using truncate to replace the contents of the file will
- * end up with a zero length file after a crash.
- */
- if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
- &BTRFS_I(inode)->runtime_flags))
- btrfs_add_ordered_operation(trans, root, inode);
-
- /*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
* first truncate that entire inode. So set this flag so we write out
@@ -8087,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
- INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
@@ -8127,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root)
goto free;
- /*
- * Make sure we're properly removed from the ordered operation
- * lists.
- */
- smp_mb();
- if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
- spin_lock(&root->fs_info->ordered_root_lock);
- list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
- }
-
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags)) {
btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8319,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = 0;
/*
- * we're using rename to replace one file with another.
- * and the replacement file is large. Start IO on it now so
- * we don't add too much work to the end of the transaction
+ * we're using rename to replace one file with another. Start IO on it
+ * now so we don't add too much work to the end of the transaction
*/
- if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
- old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */
@@ -8372,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
btrfs_pin_log_trans(root);
}
- /*
- * make sure the inode gets flushed if it is replacing
- * something.
- */
- if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
- btrfs_add_ordered_operation(trans, root, old_inode);
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
@@ -8457,6 +8447,16 @@ out_notrans:
return ret;
}
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
+ return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
@@ -8808,7 +8808,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1);
+ *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8822,7 +8822,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
+ ins.offset, 0);
btrfs_abort_transaction(trans, root, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -9000,7 +9000,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.link = btrfs_link,
.mkdir = btrfs_mkdir,
.rmdir = btrfs_rmdir,
- .rename = btrfs_rename,
+ .rename2 = btrfs_rename2,
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0d321c23069a..47aceb494d1d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
void btrfs_update_iflags(struct inode *inode)
{
struct btrfs_inode *ip = BTRFS_I(inode);
-
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ unsigned int new_fl = 0;
if (ip->flags & BTRFS_INODE_SYNC)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (ip->flags & BTRFS_INODE_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (ip->flags & BTRFS_INODE_APPEND)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (ip->flags & BTRFS_INODE_NOATIME)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ new_fl);
}
/*
@@ -3139,7 +3142,6 @@ out:
static void clone_update_extent_map(struct inode *inode,
const struct btrfs_trans_handle *trans,
const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
const u64 hole_offset,
const u64 hole_len)
{
@@ -3154,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode,
return;
}
- if (fi) {
+ if (path) {
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
em->generation = -1;
if (btrfs_file_extent_type(path->nodes[0], fi) ==
@@ -3508,18 +3514,15 @@ process_slot:
btrfs_item_ptr_offset(leaf, slot),
size);
inode_add_bytes(inode, datal);
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
}
/* If we have an implicit hole (NO_HOLES feature). */
if (drop_start < new_key.offset)
clone_update_extent_map(inode, trans,
- path, NULL, drop_start,
+ NULL, drop_start,
new_key.offset - drop_start);
- clone_update_extent_map(inode, trans, path,
- extent, 0, 0);
+ clone_update_extent_map(inode, trans, path, 0, 0);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -3562,12 +3565,10 @@ process_slot:
btrfs_end_transaction(trans, root);
goto out;
}
+ clone_update_extent_map(inode, trans, NULL, last_dest_end,
+ destoff + len - last_dest_end);
ret = clone_finish_inode_update(trans, inode, destoff + len,
destoff, olen);
- if (ret)
- goto out;
- clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
- destoff + len - last_dest_end);
}
out:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f2373..5665d2149249 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers))
return 0;
- read_lock(&eb->lock);
+ if (!read_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers))
return 0;
- write_lock(&eb->lock);
+
+ if (!write_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
BUG_ON(blockers > 1);
btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
atomic_dec(&eb->write_locks);
if (blockers) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e12441c7cf1d..963895c1f801 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -484,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
log_list);
list_del_init(&ordered->log_list);
spin_unlock_irq(&log->log_extents_lock[index]);
+
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ struct inode *inode = ordered->inode;
+ u64 start = ordered->file_offset;
+ u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(!inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
+
btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
@@ -560,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
trace_btrfs_ordered_extent_remove(inode, entry);
- /*
- * we have no more ordered extents for this inode and
- * no dirty pages. We can safely remove it from the
- * list of ordered extents
- */
- if (RB_EMPTY_ROOT(&tree->tree) &&
- !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
- spin_lock(&root->fs_info->ordered_root_lock);
- list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
- }
-
if (!root->nr_ordered_extents) {
spin_lock(&root->fs_info->ordered_root_lock);
BUG_ON(list_empty(&root->ordered_root));
@@ -676,81 +675,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
}
/*
- * this is used during transaction commit to write all the inodes
- * added to the ordered operation list. These files must be fully on
- * disk before the transaction commits.
- *
- * we have two modes here, one is to just start the IO via filemap_flush
- * and the other is to wait for all the io. When we wait, we have an
- * extra check to make sure the ordered operation list really is empty
- * before we return
- */
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int wait)
-{
- struct btrfs_inode *btrfs_inode;
- struct inode *inode;
- struct btrfs_transaction *cur_trans = trans->transaction;
- struct list_head splice;
- struct list_head works;
- struct btrfs_delalloc_work *work, *next;
- int ret = 0;
-
- INIT_LIST_HEAD(&splice);
- INIT_LIST_HEAD(&works);
-
- mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
- spin_lock(&root->fs_info->ordered_root_lock);
- list_splice_init(&cur_trans->ordered_operations, &splice);
- while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- ordered_operations);
- inode = &btrfs_inode->vfs_inode;
-
- list_del_init(&btrfs_inode->ordered_operations);
-
- /*
- * the inode may be getting freed (in sys_unlink path).
- */
- inode = igrab(inode);
- if (!inode)
- continue;
-
- if (!wait)
- list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
-
- work = btrfs_alloc_delalloc_work(inode, wait, 1);
- if (!work) {
- spin_lock(&root->fs_info->ordered_root_lock);
- if (list_empty(&BTRFS_I(inode)->ordered_operations))
- list_add_tail(&btrfs_inode->ordered_operations,
- &splice);
- list_splice_tail(&splice,
- &cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &work->work);
-
- cond_resched();
- spin_lock(&root->fs_info->ordered_root_lock);
- }
- spin_unlock(&root->fs_info->ordered_root_lock);
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
- }
- mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
- return ret;
-}
-
-/*
* Used to start IO or wait for a given ordered extent to finish.
*
* If wait is one, this effectively waits on page writeback for all the pages
@@ -1109,42 +1033,6 @@ out:
return index;
}
-
-/*
- * add a given inode to the list of inodes that must be fully on
- * disk before a transaction commit finishes.
- *
- * This basically gives us the ext3 style data=ordered mode, and it is mostly
- * used to make sure renamed files are fully on disk.
- *
- * It is a noop if the inode is already fully on disk.
- *
- * If trans is not null, we'll do a friendly check for a transaction that
- * is already flushing things and force the IO down ourselves.
- */
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
-{
- struct btrfs_transaction *cur_trans = trans->transaction;
- u64 last_mod;
-
- last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
-
- /*
- * if this file hasn't been changed since the last transaction
- * commit, we can safely return without doing anything
- */
- if (last_mod <= root->fs_info->last_trans_committed)
- return;
-
- spin_lock(&root->fs_info->ordered_root_lock);
- if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
- list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &cur_trans->ordered_operations);
- }
- spin_unlock(&root->fs_info->ordered_root_lock);
-}
-
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int wait);
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6efd70d3b64f..9626b4ad3b9a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
-static void print_extent_item(struct extent_buffer *eb, int slot)
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- int type;
u32 item_size = btrfs_item_size_nr(eb, slot);
u64 flags;
u64 offset;
@@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
flags);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
@@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_disk_root_refs(l, ri));
break;
case BTRFS_EXTENT_ITEM_KEY:
- print_extent_item(l, i);
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
printk(KERN_INFO "\t\ttree block backref\n");
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..b497498484be 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1201,6 +1201,50 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
+
+static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ /*
+ * Ignore seq and type here, we're looking for any operation
+ * at all related to this extent on that root.
+ */
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ return 0;
+}
+
+static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = fs_info->qgroup_op_tree.rb_node;
+ while (n) {
+ cur = rb_entry(n, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper_exist(cur, oper);
+ if (cmp < 0) {
+ n = n->rb_right;
+ } else if (cmp) {
+ n = n->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
+
static int comp_oper(struct btrfs_qgroup_operation *oper1,
struct btrfs_qgroup_operation *oper2)
{
@@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
INIT_LIST_HEAD(&oper->elem.list);
oper->elem.seq = 0;
+
+ if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
+ /*
+ * If any operation for this bytenr/ref_root combo
+ * exists, then we know it's not exclusively owned and
+ * shouldn't be queued up.
+ *
+ * This also catches the case where we have a cloned
+ * extent that gets queued up multiple times during
+ * drop snapshot.
+ */
+ if (qgroup_oper_exists(fs_info, oper)) {
+ kfree(oper);
+ return 0;
+ }
+ }
+
ret = insert_qgroup_oper(fs_info, oper);
if (ret) {
/* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1945,111 @@ out:
}
/*
+ * Process a reference to a shared subtree. This type of operation is
+ * queued during snapshot removal when we encounter extents which are
+ * shared between more than one root.
+ */
+static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup_list *glist;
+ struct ulist *parents;
+ int ret = 0;
+ int err;
+ struct btrfs_qgroup *qg;
+ u64 root_obj = 0;
+ struct seq_list elem = {};
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ elem.seq, &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0)
+ return ret;
+
+ if (roots->nnodes != 1)
+ goto out;
+
+ ULIST_ITER_INIT(&uiter);
+ unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
+ /*
+ * If we find our ref root then that means all refs
+ * this extent has to the root have not yet been
+ * deleted. In that case, we do nothing and let the
+ * last ref for this bytenr drive our update.
+ *
+ * This can happen for example if an extent is
+ * referenced multiple times in a snapshot (clone,
+ * etc). If we are in the middle of snapshot removal,
+ * queued updates for such an extent will find the
+ * root if we have not yet finished removing the
+ * snapshot.
+ */
+ if (unode->val == oper->ref_root)
+ goto out;
+
+ root_obj = unode->val;
+ BUG_ON(!root_obj);
+
+ spin_lock(&fs_info->qgroup_lock);
+ qg = find_qgroup_rb(fs_info, root_obj);
+ if (!qg)
+ goto out_unlock;
+
+ qg->excl += oper->num_bytes;
+ qg->excl_cmpr += oper->num_bytes;
+ qgroup_dirty(fs_info, qg);
+
+ /*
+ * Adjust counts for parent groups. First we find all
+ * parents, then in the 2nd loop we do the adjustment
+ * while adding parents of the parents to our ulist.
+ */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ err = ulist_add(parents, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (err < 0) {
+ ret = err;
+ goto out_unlock;
+ }
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(parents, &uiter))) {
+ qg = u64_to_ptr(unode->aux);
+ qg->excl += oper->num_bytes;
+ qg->excl_cmpr += oper->num_bytes;
+ qgroup_dirty(fs_info, qg);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ err = ulist_add(parents, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (err < 0) {
+ ret = err;
+ goto out_unlock;
+ }
+ }
+ }
+
+out_unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+
+out:
+ ulist_free(roots);
+ ulist_free(parents);
+ return ret;
+}
+
+/*
* btrfs_qgroup_account_ref is called for every ref that is added to or deleted
* from the fs. First, all roots referencing the extent are searched, and
* then the space is accounted accordingly to the different roots. The
@@ -1920,6 +2086,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
case BTRFS_QGROUP_OPER_SUB_SHARED:
ret = qgroup_shared_accounting(trans, fs_info, oper);
break;
+ case BTRFS_QGROUP_OPER_SUB_SUBTREE:
+ ret = qgroup_subtree_accounting(trans, fs_info, oper);
+ break;
default:
ASSERT(0);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
BTRFS_QGROUP_OPER_ADD_SHARED,
BTRFS_QGROUP_OPER_SUB_EXCL,
BTRFS_QGROUP_OPER_SUB_SHARED,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
};
struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4055291a523e..4a88f073fdd7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* pages are going to be uptodate.
*/
for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
- if (rbio->faila == stripe ||
- rbio->failb == stripe)
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->bbio->error);
continue;
+ }
for (pagenr = 0; pagenr < nr_pages; pagenr++) {
struct page *p;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ac80188eec88..b6d198f5181e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2725,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start) {
- key.offset = found_key.offset + length;
- btrfs_release_path(path);
- continue;
- }
+ if (found_key.offset + length <= start)
+ goto skip;
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2740,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* the chunk from going away while we scrub it
*/
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -2802,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
-
+skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4662d92a4b73..c4124de4435b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_ssd_spread:
btrfs_set_and_info(root, SSD_SPREAD,
"use spread ssd allocation scheme");
+ btrfs_set_opt(info->mount_opt, SSD);
break;
case Opt_nossd:
- btrfs_clear_and_info(root, NOSSD,
+ btrfs_set_and_info(root, NOSSD,
"not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
break;
@@ -850,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
- struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -921,13 +921,7 @@ setup_root:
return dget(sb->s_root);
}
- dentry = d_obtain_alias(inode);
- if (!IS_ERR(dentry)) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_DISCONNECTED;
- spin_unlock(&dentry->d_lock);
- }
- return dentry;
+ return d_obtain_root(inode);
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1467,7 +1461,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
/* recover relocation */
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret)
goto restore;
@@ -1669,6 +1665,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
return 0;
}
+/*
+ * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
+ *
+ * If there's a redundant raid level at DATA block groups, use the respective
+ * multiplier to scale the sizes.
+ *
+ * Unused device space usage is based on simulating the chunk allocator
+ * algorithm that respects the device sizes, order of allocations and the
+ * 'alloc_start' value, this is a close approximation of the actual use but
+ * there are other factors that may change the result (like a new metadata
+ * chunk).
+ *
+ * FIXME: not accurate for mixed block groups, total and free/used are ok,
+ * available appears slightly larger.
+ */
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1679,6 +1690,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
+ unsigned factor = 1;
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
/* holding chunk_muext to avoid allocating new chunks */
@@ -1686,30 +1699,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int i;
+
total_free_data += found->disk_total - found->disk_used;
total_free_data -=
btrfs_account_ro_block_groups_free_space(found);
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!list_empty(&found->block_groups[i])) {
+ switch (i) {
+ case BTRFS_RAID_DUP:
+ case BTRFS_RAID_RAID1:
+ case BTRFS_RAID_RAID10:
+ factor = 2;
+ }
+ }
+ }
}
total_used += found->disk_used;
}
+
rcu_read_unlock();
- buf->f_namelen = BTRFS_NAME_LEN;
- buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
- buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bsize = dentry->d_sb->s_blocksize;
- buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
+ buf->f_blocks >>= bits;
+ buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
+
+ /* Account global block reserve as used, it's in logical size already */
+ spin_lock(&block_rsv->lock);
+ buf->f_bfree -= block_rsv->size >> bits;
+ spin_unlock(&block_rsv->lock);
+
buf->f_bavail = total_free_data;
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
- buf->f_bavail += total_free_data;
+ buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
mutex_unlock(&fs_info->chunk_mutex);
+ buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_namelen = BTRFS_NAME_LEN;
+
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
on a big-endian or little-endian host */
@@ -1808,6 +1843,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
list_for_each_entry(dev, head, dev_list) {
if (dev->missing)
continue;
+ if (!dev->name)
+ continue;
if (!first_dev || dev->devid < first_dev->devid)
first_dev = dev;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index df39458f1487..78699364f537 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -605,14 +605,37 @@ static void init_feature_attrs(void)
}
}
-static int add_device_membership(struct btrfs_fs_info *fs_info)
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!fs_info->device_dir_kobj)
+ return -EINVAL;
+
+ if (one_device) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_info->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
{
int error = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *dev;
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
+ if (!fs_info->device_dir_kobj)
+ fs_info->device_dir_kobj = kobject_create_and_add("devices",
&fs_info->super_kobj);
+
if (!fs_info->device_dir_kobj)
return -ENOMEM;
@@ -623,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
if (!dev->bdev)
continue;
+ if (one_device && one_device != dev)
+ continue;
+
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
@@ -666,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
if (error)
goto failure;
- error = add_device_membership(fs_info);
+ error = btrfs_kobj_add_device(fs_info, NULL);
if (error)
goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9ab576318a84..ac46df37504c 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 511839c04f11..d89c6d3542ca 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- INIT_LIST_HEAD(&cur_trans->ordered_operations);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -386,11 +385,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
bool reloc_reserved = false;
int ret;
+ /* Send isn't supposed to start transactions. */
+ ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
- if (current->journal_info &&
- current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
+ if (current->journal_info) {
WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
h->use_count++;
@@ -491,6 +492,7 @@ again:
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
may_wait_transaction(root, type)) {
+ current->journal_info = h;
btrfs_commit_transaction(h, root);
goto again;
}
@@ -1609,32 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
-static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- int ret;
-
- ret = btrfs_run_delayed_items(trans, root);
- /*
- * running the delayed items may have added new refs. account
- * them now so that they hinder processing of more delayed refs
- * as little as possible.
- */
- if (ret)
- return ret;
-
- /*
- * rename don't use btrfs_join_transaction, so, once we
- * set the transaction to blocked above, we aren't going
- * to get any new ordered operations. We can safely run
- * it here and no for sure that nothing new will be added
- * to the list
- */
- ret = btrfs_run_ordered_operations(trans, root, 1);
-
- return ret;
-}
-
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1655,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *prev_trans = NULL;
int ret;
- ret = btrfs_run_ordered_operations(trans, root, 0);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_end_transaction(trans, root);
- return ret;
- }
-
/* Stop the commit early if ->aborted is set */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
@@ -1742,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret)
goto cleanup_transaction;
- ret = btrfs_flush_all_pending_stuffs(trans, root);
+ ret = btrfs_run_delayed_items(trans, root);
if (ret)
goto cleanup_transaction;
@@ -1750,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
- ret = btrfs_flush_all_pending_stuffs(trans, root);
+ ret = btrfs_run_delayed_items(trans, root);
if (ret)
goto cleanup_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..579be51b27e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
- struct list_head ordered_operations;
struct list_head pending_chunks;
struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask);
+
+/* just like ulist_add_merge() but take a pointer for the aux data */
+static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+ void **old_aux, gfp_t gfp_mask)
+{
+#if BITS_PER_LONG == 32
+ u64 old64 = (uintptr_t)*old_aux;
+ int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
+ *old_aux = (void *)((uintptr_t)old64);
+ return ret;
+#else
+ return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
+#endif
+}
+
struct ulist_node *ulist_next(struct ulist *ulist,
struct ulist_iterator *uiter);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ffeed6d6326f..6cb82f62cb7c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,7 @@
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
- name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
- if (!name) {
- kfree(device);
- goto error;
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
}
- rcu_assign_pointer(device->name, name);
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
@@ -1677,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev == root->fs_info->fs_devices->latest_bdev)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
- if (device->bdev)
+ if (device->bdev) {
device->fs_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_kobj_rm_device(root->fs_info, device);
+ }
call_rcu(&device->rcu, free_device);
@@ -2143,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
+
+ /* add sysfs device entry */
+ btrfs_kobj_add_device(root->fs_info, device);
+
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = init_first_rw_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2156,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+
+ /* Sprouting would change fsid of the mounted root,
+ * so rename the fsid on the sysfs
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+ root->fs_info->fsid);
+ if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
if (ret) {
@@ -2205,6 +2224,7 @@ error_trans:
unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
+ btrfs_kobj_rm_device(root->fs_info, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2543,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- kfree(map);
- em->bdev = NULL;
-
/* once for the tree */
free_extent_map(em);
/* once for us */
@@ -4301,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
@@ -4346,7 +4365,6 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
@@ -4558,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -5362,6 +5379,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+ if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+ bio_endio_nodec(bio, err);
+ else
+ bio_endio(bio, err);
+ kfree(bbio);
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
@@ -5402,12 +5428,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio = bbio->orig_bio;
}
- /*
- * We have original bio now. So increment bi_remaining to
- * account for it in endio
- */
- atomic_inc(&bio->bi_remaining);
-
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5424,9 +5444,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(bbio);
- bio_endio(bio, err);
+ btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
@@ -5589,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ /* Shoud be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- kfree(bbio);
- bio_endio(bio, -EIO);
+
+ btrfs_end_bbio(bbio, bio, -EIO);
}
}
@@ -5681,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
submit_stripe_bio(root, bbio, bio,
@@ -5822,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
@@ -5846,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5854,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1a15bbeb65e2..2aaa00c47816 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -190,11 +190,14 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_bio {
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
+ unsigned long flags;
void *private;
atomic_t error;
int max_errors;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 4f196314c0c1..b67d8fc81277 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (workspace->def_strm.total_in > 8192 &&
workspace->def_strm.total_in <
workspace->def_strm.total_out) {
- ret = -EIO;
+ ret = -E2BIG;
goto out;
}
/* we need another page for writing out. Test this