summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c21
-rw-r--r--fs/btrfs/async-thread.c14
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/compression.c4
-rw-r--r--fs/btrfs/ctree.c32
-rw-r--r--fs/btrfs/ctree.h145
-rw-r--r--fs/btrfs/delayed-inode.c6
-rw-r--r--fs/btrfs/dev-replace.c8
-rw-r--r--fs/btrfs/disk-io.c135
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c246
-rw-r--r--fs/btrfs/extent_io.c58
-rw-r--r--fs/btrfs/extent_io.h19
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h10
-rw-r--r--fs/btrfs/file.c11
-rw-r--r--fs/btrfs/free-space-cache.c37
-rw-r--r--fs/btrfs/hash.c5
-rw-r--r--fs/btrfs/inode.c116
-rw-r--r--fs/btrfs/ioctl.c48
-rw-r--r--fs/btrfs/qgroup.c25
-rw-r--r--fs/btrfs/raid56.c21
-rw-r--r--fs/btrfs/relocation.c50
-rw-r--r--fs/btrfs/root-tree.c27
-rw-r--r--fs/btrfs/scrub.c4
-rw-r--r--fs/btrfs/send.c42
-rw-r--r--fs/btrfs/struct-funcs.c9
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/tests/qgroup-tests.c2
-rw-r--r--fs/btrfs/transaction.c6
-rw-r--r--fs/btrfs/tree-checker.c649
-rw-r--r--fs/btrfs/tree-checker.h38
-rw-r--r--fs/btrfs/tree-log.c120
-rw-r--r--fs/btrfs/uuid-tree.c4
-rw-r--r--fs/btrfs/volumes.c171
-rw-r--r--fs/btrfs/volumes.h2
37 files changed, 1689 insertions, 406 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b93b1aa..c792df826e12 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o
+ uuid-tree.o props.o hash.o tree-checker.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a95851..6b16b8653d98 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -82,14 +82,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
- if (acl) {
- ret = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (ret < 0)
- return ret;
- if (ret == 0)
- acl = NULL;
- }
- ret = 0;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
@@ -125,7 +117,18 @@ out:
int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- return __btrfs_set_acl(NULL, inode, acl, type);
+ int ret;
+ umode_t old_mode = inode->i_mode;
+
+ if (type == ACL_TYPE_ACCESS && acl) {
+ ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (ret)
+ return ret;
+ }
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
+ if (ret)
+ inode->i_mode = old_mode;
+ return ret;
}
/*
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 9aba42b78253..a09264d8b853 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -70,6 +70,20 @@ void btrfs_##name(struct work_struct *arg) \
normal_work_helper(work); \
}
+bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq)
+{
+ /*
+ * We could compare wq->normal->pending with num_online_cpus()
+ * to support "thresh == NO_THRESHOLD" case, but it requires
+ * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
+ * postpone it until someone needs the support of that case.
+ */
+ if (wq->normal->thresh == NO_THRESHOLD)
+ return false;
+
+ return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+}
+
BTRFS_WORK_HELPER(worker_helper);
BTRFS_WORK_HELPER(delalloc_helper);
BTRFS_WORK_HELPER(flush_delalloc_helper);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ad4d0647d1a6..8e1d6576d764 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -80,4 +80,5 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
+bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c473c42d7d6c..bae05c5c75ba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -694,7 +694,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
if (ret) {
- bio->bi_error = ret;
+ comp_bio->bi_error = ret;
bio_endio(comp_bio);
}
@@ -723,7 +723,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
if (ret) {
- bio->bi_error = ret;
+ comp_bio->bi_error = ret;
bio_endio(comp_bio);
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0f2b7c622ce3..8f4baa3cb992 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1726,20 +1726,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
return err;
}
-/*
- * The leaf data grows from end-to-front in the node.
- * this returns the address of the start of the last item,
- * which is the stop of the leaf data stack
- */
-static inline unsigned int leaf_data_end(struct btrfs_root *root,
- struct extent_buffer *leaf)
-{
- u32 nr = btrfs_header_nritems(leaf);
- if (nr == 0)
- return BTRFS_LEAF_DATA_SIZE(root);
- return btrfs_item_offset_nr(leaf, nr - 1);
-}
-
/*
* search for key in the extent_buffer. The items start at offset p,
@@ -2497,10 +2483,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
if (p->reada)
reada_for_search(root, p, level, slot, key->objectid);
- btrfs_release_path(p);
-
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, 0);
+ tmp = read_tree_block(root, blocknr, gen);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2512,6 +2496,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
ret = -EIO;
free_extent_buffer(tmp);
}
+
+ btrfs_release_path(p);
return ret;
}
@@ -2769,6 +2755,8 @@ again:
* contention with the cow code
*/
if (cow) {
+ bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
+
/*
* if we don't really need to cow this block
* then we don't want to set the path blocking,
@@ -2793,9 +2781,13 @@ again:
}
btrfs_set_path_blocking(p);
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b);
+ if (last_level)
+ err = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b);
+ else
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
if (err) {
ret = err;
goto done;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1391f72c28c3..4a91d3119e59 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+#include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -897,6 +898,7 @@ struct btrfs_balance_item {
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
+#define BTRFS_FILE_EXTENT_TYPES 2
struct btrfs_file_extent_item {
/*
@@ -2283,7 +2285,7 @@ do { \
#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
struct btrfs_map_token {
- struct extent_buffer *eb;
+ const struct extent_buffer *eb;
char *kaddr;
unsigned long offset;
};
@@ -2314,18 +2316,19 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
sizeof(((type *)0)->member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
- unsigned long off, \
- struct btrfs_map_token *token); \
-void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
+u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off, \
+ struct btrfs_map_token *token); \
+void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \
unsigned long off, u##bits val, \
struct btrfs_map_token *token); \
-static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, \
unsigned long off) \
{ \
return btrfs_get_token_##bits(eb, ptr, off, NULL); \
} \
-static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\
unsigned long off, u##bits val) \
{ \
btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
@@ -2337,7 +2340,8 @@ DECLARE_BTRFS_SETGET_BITS(32)
DECLARE_BTRFS_SETGET_BITS(64)
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
+ const type *s) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
@@ -2348,7 +2352,8 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
-static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\
+ const type *s, \
struct btrfs_map_token *token) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
@@ -2363,9 +2368,9 @@ static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(struct extent_buffer *eb) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- type *p = page_address(eb->pages[0]); \
+ const type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \
return res; \
} \
@@ -2377,7 +2382,7 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(type *s) \
+static inline u##bits btrfs_##name(const type *s) \
{ \
return le##bits##_to_cpu(s->member); \
} \
@@ -2678,7 +2683,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr)
sizeof(struct btrfs_key_ptr) * nr;
}
-void btrfs_node_key(struct extent_buffer *eb,
+void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
static inline void btrfs_set_node_key(struct extent_buffer *eb,
@@ -2707,28 +2712,28 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
-static inline u32 btrfs_item_end(struct extent_buffer *eb,
+static inline u32 btrfs_item_end(const struct extent_buffer *eb,
struct btrfs_item *item)
{
return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
}
-static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
{
return btrfs_item_end(eb, btrfs_item_nr(nr));
}
-static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
{
return btrfs_item_offset(eb, btrfs_item_nr(nr));
}
-static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
{
return btrfs_item_size(eb, btrfs_item_nr(nr));
}
-static inline void btrfs_item_key(struct extent_buffer *eb,
+static inline void btrfs_item_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
struct btrfs_item *item = btrfs_item_nr(nr);
@@ -2764,8 +2769,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
transid, 64);
-static inline void btrfs_dir_item_key(struct extent_buffer *eb,
- struct btrfs_dir_item *item,
+static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
struct btrfs_disk_key *key)
{
read_eb_member(eb, item, struct btrfs_dir_item, location, key);
@@ -2773,7 +2778,7 @@ static inline void btrfs_dir_item_key(struct extent_buffer *eb,
static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
struct btrfs_dir_item *item,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, item, struct btrfs_dir_item, location, key);
}
@@ -2785,8 +2790,8 @@ BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
generation, 64);
-static inline void btrfs_free_space_key(struct extent_buffer *eb,
- struct btrfs_free_space_header *h,
+static inline void btrfs_free_space_key(const struct extent_buffer *eb,
+ const struct btrfs_free_space_header *h,
struct btrfs_disk_key *key)
{
read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
@@ -2794,7 +2799,7 @@ static inline void btrfs_free_space_key(struct extent_buffer *eb,
static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
struct btrfs_free_space_header *h,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
}
@@ -2821,25 +2826,25 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
disk->objectid = cpu_to_le64(cpu->objectid);
}
-static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
{
struct btrfs_disk_key disk_key;
btrfs_node_key(eb, &disk_key, nr);
btrfs_disk_key_to_cpu(key, &disk_key);
}
-static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
{
struct btrfs_disk_key disk_key;
btrfs_item_key(eb, &disk_key, nr);
btrfs_disk_key_to_cpu(key, &disk_key);
}
-static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
- struct btrfs_dir_item *item,
- struct btrfs_key *key)
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *key)
{
struct btrfs_disk_key disk_key;
btrfs_dir_item_key(eb, item, &disk_key);
@@ -2872,7 +2877,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
nritems, 32);
BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
-static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
{
return (btrfs_header_flags(eb) & flag) == flag;
}
@@ -2891,7 +2896,7 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
return (flags & flag) == flag;
}
-static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
{
u64 flags = btrfs_header_flags(eb);
return flags >> BTRFS_BACKREF_REV_SHIFT;
@@ -2911,12 +2916,12 @@ static inline unsigned long btrfs_header_fsid(void)
return offsetof(struct btrfs_header, fsid);
}
-static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
{
return offsetof(struct btrfs_header, chunk_tree_uuid);
}
-static inline int btrfs_is_leaf(struct extent_buffer *eb)
+static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
}
@@ -2950,12 +2955,12 @@ BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
rtransid, 64);
-static inline bool btrfs_root_readonly(struct btrfs_root *root)
+static inline bool btrfs_root_readonly(const struct btrfs_root *root)
{
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
-static inline bool btrfs_root_dead(struct btrfs_root *root)
+static inline bool btrfs_root_dead(const struct btrfs_root *root)
{
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
@@ -3012,51 +3017,51 @@ BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
-static inline void btrfs_balance_data(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
+static inline void btrfs_balance_data(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
struct btrfs_disk_balance_args *ba)
{
read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
}
static inline void btrfs_set_balance_data(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
{
write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
}
-static inline void btrfs_balance_meta(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
+static inline void btrfs_balance_meta(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
struct btrfs_disk_balance_args *ba)
{
read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
}
static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
{
write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
}
-static inline void btrfs_balance_sys(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
+static inline void btrfs_balance_sys(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
struct btrfs_disk_balance_args *ba)
{
read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
{
write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
static inline void
btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- struct btrfs_disk_balance_args *disk)
+ const struct btrfs_disk_balance_args *disk)
{
memset(cpu, 0, sizeof(*cpu));
@@ -3070,11 +3075,13 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
cpu->target = le64_to_cpu(disk->target);
cpu->flags = le64_to_cpu(disk->flags);
cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
}
static inline void
btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
- struct btrfs_balance_args *cpu)
+ const struct btrfs_balance_args *cpu)
{
memset(disk, 0, sizeof(*disk));
@@ -3088,6 +3095,8 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
disk->target = cpu_to_le64(cpu->target);
disk->flags = cpu_to_le64(cpu->flags);
disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
}
/* struct btrfs_super_block */
@@ -3140,7 +3149,7 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
-static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
/*
@@ -3154,6 +3163,21 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
return offsetof(struct btrfs_leaf, items);
}
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the address of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(const struct btrfs_root *root,
+ const struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(root);
+ return btrfs_item_offset_nr(leaf, nr - 1);
+}
+
/* struct btrfs_file_extent_item */
BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
@@ -3170,7 +3194,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
struct btrfs_file_extent_item, compression, 8);
static inline unsigned long
-btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e)
{
return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
@@ -3204,8 +3228,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
* size of any extent headers. If a file is compressed on disk, this is
* the compressed size
*/
-static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
- struct btrfs_item *e)
+static inline u32 btrfs_file_extent_inline_item_len(
+ const struct extent_buffer *eb,
+ struct btrfs_item *e)
{
return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
@@ -3213,9 +3238,9 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
/* this returns the number of file bytes represented by the inline item.
* If an item is compressed, this is the uncompressed size
*/
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
- int slot,
- struct btrfs_file_extent_item *fi)
+static inline u32 btrfs_file_extent_inline_len(const struct extent_buffer *eb,
+ int slot,
+ const struct btrfs_file_extent_item *fi)
{
struct btrfs_map_token token;
@@ -3237,8 +3262,8 @@ static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
/* btrfs_dev_stats_item */
-static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
- struct btrfs_dev_stats_item *ptr,
+static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
+ const struct btrfs_dev_stats_item *ptr,
int index)
{
u64 val;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 02b934d0ee65..09fa5af9782e 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1375,7 +1375,8 @@ release_path:
total_done++;
btrfs_release_prepared_delayed_node(delayed_node);
- if (async_work->nr == 0 || total_done < async_work->nr)
+ if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
+ total_done < async_work->nr)
goto again;
free_path:
@@ -1391,7 +1392,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
{
struct btrfs_async_delayed_work *async_work;
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
+ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
return 0;
async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1e668fb7dd4c..81e5bc62e8e3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -574,6 +574,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_rm_dev_replace_unblocked(fs_info);
/*
+ * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
+ * update on-disk dev stats value during commit transaction
+ */
+ atomic_inc(&tgt_device->dev_stats_ccnt);
+
+ /*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
* source device is not part of the filesystem anymore and its 1st
@@ -614,7 +620,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
em = lookup_extent_mapping(em_tree, start, (u64)-1);
if (!em)
break;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 85b207d19aa5..78722aaffecd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -49,6 +49,7 @@
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
+#include "tree-checker.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -445,9 +446,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
int mirror_num = 0;
int failed_mirror = 0;
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
@@ -459,14 +460,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = -EIO;
}
- /*
- * This buffer's crc is fine, but its contents are corrupted, so
- * there is no reason to read the other copies, they won't be
- * any less wrong.
- */
- if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
- break;
-
num_copies = btrfs_num_copies(root->fs_info,
eb->start, eb->len);
if (num_copies == 1)
@@ -530,72 +523,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
return ret;
}
-#define CORRUPT(reason, eb, root, slot) \
- btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
- "root=%llu, slot=%d", reason, \
- btrfs_header_bytenr(eb), root->objectid, slot)
-
-static noinline int check_leaf(struct btrfs_root *root,
- struct extent_buffer *leaf)
-{
- struct btrfs_key key;
- struct btrfs_key leaf_key;
- u32 nritems = btrfs_header_nritems(leaf);
- int slot;
-
- if (nritems == 0)
- return 0;
-
- /* Check the 0 item */
- if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
- BTRFS_LEAF_DATA_SIZE(root)) {
- CORRUPT("invalid item offset size pair", leaf, root, 0);
- return -EIO;
- }
-
- /*
- * Check to make sure each items keys are in the correct order and their
- * offsets make sense. We only have to loop through nritems-1 because
- * we check the current slot against the next slot, which verifies the
- * next slot's offset+size makes sense and that the current's slot
- * offset is correct.
- */
- for (slot = 0; slot < nritems - 1; slot++) {
- btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &key, slot + 1);
-
- /* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
- CORRUPT("bad key order", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Make sure the offset and ends are right, remember that the
- * item data starts at the end of the leaf and grows towards the
- * front.
- */
- if (btrfs_item_offset_nr(leaf, slot) !=
- btrfs_item_end_nr(leaf, slot + 1)) {
- CORRUPT("slot offset bad", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Check to make sure that we don't point outside of the leaf,
- * just incase all the items are consistent to eachother, but
- * all point outside of the leaf.
- */
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(root)) {
- CORRUPT("slot end outside of leaf", leaf, root, slot);
- return -EIO;
- }
- }
-
- return 0;
-}
-
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
@@ -662,11 +589,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && check_leaf(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
+ if (found_level > 0 && btrfs_check_node(root, eb))
+ ret = -EIO;
+
if (!ret)
set_extent_buffer_uptodate(eb);
err:
@@ -923,7 +853,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (cpu_has_xmm4_2)
+ if (static_cpu_has(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
@@ -1011,8 +941,9 @@ static int btree_writepages(struct address_space *mapping,
fs_info = BTRFS_I(mapping->host)->root->fs_info;
/* this is a bit racy, but that's ok */
- ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
- BTRFS_DIRTY_METADATA_THRESH);
+ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH,
+ fs_info->dirty_metadata_batch);
if (ret < 0)
return 0;
}
@@ -1196,7 +1127,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1607,8 +1538,8 @@ fail:
return ret;
}
-static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id)
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
{
struct btrfs_root *root;
@@ -3965,7 +3896,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->len,
root->fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ /*
+ * Since btrfs_mark_buffer_dirty() can be called with item pointer set
+ * but item data not updated.
+ * So here we should only check item pointers, not item data.
+ */
+ if (btrfs_header_level(buf) == 0 &&
+ btrfs_check_leaf_relaxed(root, buf)) {
btrfs_print_leaf(root, buf);
ASSERT(0);
}
@@ -3987,8 +3924,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
if (flush_delayed)
btrfs_balance_delayed_items(root);
- ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
- BTRFS_DIRTY_METADATA_THRESH);
+ ret = __percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH,
+ root->fs_info->dirty_metadata_batch);
if (ret > 0) {
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
@@ -4173,6 +4111,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
+
+ /*
+ * We need this here because if we've been flipped read-only we won't
+ * get sync() from the umount, so we need to make sure any ordered
+ * extents that haven't had their dirty pages IO start writeout yet
+ * actually get run and error out properly.
+ */
+ btrfs_wait_ordered_roots(fs_info, -1);
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4331,6 +4277,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
u64 start;
u64 end;
@@ -4340,21 +4287,31 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
unpin = pinned_extents;
again:
while (1) {
+ /*
+ * The btrfs_finish_extent_commit() may get the same range as
+ * ours between find_first_extent_bit and clear_extent_dirty.
+ * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
+ * the same extent range.
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
- if (ret)
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
if (loop) {
- if (unpin == &root->fs_info->freed_extents[0])
- unpin = &root->fs_info->freed_extents[1];
+ if (unpin == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
else
- unpin = &root->fs_info->freed_extents[0];
+ unpin = &fs_info->freed_extents[0];
loop = false;
goto again;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index adeb31830b9c..3c9819403487 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47cdc6f3390b..978bbfed5a2c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2342,7 +2342,13 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
}
- BUG_ON(node->ref_mod != 1);
+ if (node->ref_mod != 1) {
+ btrfs_err(root->fs_info,
+ "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
+ node->bytenr, node->ref_mod, node->action, ref_root,
+ parent);
+ return -EIO;
+ }
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, root,
@@ -2520,11 +2526,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (ref && ref->seq &&
btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
spin_unlock(&locked_ref->lock);
- btrfs_delayed_ref_unlock(locked_ref);
spin_lock(&delayed_refs->lock);
locked_ref->processing = 0;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
cond_resched();
count++;
@@ -2570,7 +2576,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
*/
if (must_insert_reserved)
locked_ref->must_insert_reserved = 1;
+ spin_lock(&delayed_refs->lock);
locked_ref->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
btrfs_delayed_ref_unlock(locked_ref);
return ret;
@@ -3358,13 +3367,6 @@ again:
goto again;
}
- /* We've already setup this transaction, go ahead and exit */
- if (block_group->cache_generation == trans->transid &&
- i_size_read(inode)) {
- dcs = BTRFS_DC_SETUP;
- goto out_put;
- }
-
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
@@ -3388,6 +3390,13 @@ again:
}
WARN_ON(ret);
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
if (i_size_read(inode) > 0) {
ret = btrfs_check_trunc_cache_free_space(root,
&root->fs_info->global_block_rsv);
@@ -3851,6 +3860,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
info->space_info_kobj, "%s",
alloc_name(found->flags));
if (ret) {
+ percpu_counter_destroy(&found->total_bytes_pinned);
kfree(found);
return ret;
}
@@ -4124,7 +4134,7 @@ commit_trans:
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
- return ret;
+ return 0;
}
/*
@@ -4388,6 +4398,7 @@ again:
if (wait_for_alloc) {
mutex_unlock(&fs_info->chunk_mutex);
wait_for_alloc = 0;
+ cond_resched();
goto again;
}
@@ -7830,6 +7841,20 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return ERR_PTR(-ENOMEM);
+
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(root->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
@@ -8486,14 +8511,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
&wc->refs[level - 1],
&wc->flags[level - 1]);
- if (ret < 0) {
- btrfs_tree_unlock(next);
- return ret;
- }
+ if (ret < 0)
+ goto out_unlock;
if (unlikely(wc->refs[level - 1] == 0)) {
btrfs_err(root->fs_info, "Missing references.");
- BUG();
+ ret = -EIO;
+ goto out_unlock;
}
*lookup_info = 0;
@@ -8545,7 +8569,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
level--;
- BUG_ON(level != btrfs_header_level(next));
+ ASSERT(level == btrfs_header_level(next));
+ if (level != btrfs_header_level(next)) {
+ btrfs_err(root->fs_info, "mismatched level");
+ ret = -EIO;
+ goto out_unlock;
+ }
path->nodes[level] = next;
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
@@ -8560,8 +8589,15 @@ skip:
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
parent = path->nodes[level]->start;
} else {
- BUG_ON(root->root_key.objectid !=
+ ASSERT(root->root_key.objectid ==
btrfs_header_owner(path->nodes[level]));
+ if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level])) {
+ btrfs_err(root->fs_info,
+ "mismatched block owner");
+ ret = -EIO;
+ goto out_unlock;
+ }
parent = 0;
}
@@ -8578,12 +8614,18 @@ skip:
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto out_unlock;
}
+
+ *lookup_info = 1;
+ ret = 1;
+
+out_unlock:
btrfs_tree_unlock(next);
free_extent_buffer(next);
- *lookup_info = 1;
- return 1;
+
+ return ret;
}
/*
@@ -8682,15 +8724,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(eb));
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level + 1]));
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8698,6 +8739,11 @@ out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
+
+owner_mismatch:
+ btrfs_err_rl(root->fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -8751,6 +8797,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
+ if (ret < 0)
+ return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
@@ -9439,6 +9487,8 @@ static int find_first_block_group(struct btrfs_root *root,
int ret = 0;
struct btrfs_key found_key;
struct extent_buffer *leaf;
+ struct btrfs_block_group_item bg;
+ u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
@@ -9460,7 +9510,47 @@ static int find_first_block_group(struct btrfs_root *root,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, found_key.objectid,
+ found_key.offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(root->fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ found_key.objectid, found_key.offset);
+ ret = -ENOENT;
+ } else if (em->start != found_key.objectid ||
+ em->len != found_key.offset) {
+ btrfs_err(root->fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ found_key.objectid, found_key.offset,
+ em->start, em->len);
+ ret = -EUCLEAN;
+ } else {
+ read_extent_buffer(leaf, &bg,
+ btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type &
+ BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(root->fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ found_key.objectid,
+ found_key.offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK &
+ em->map_lookup->type));
+ ret = -EUCLEAN;
+ } else {
+ ret = 0;
+ }
+ }
+ free_extent_map(em);
goto out;
}
path->slots[0]++;
@@ -9479,6 +9569,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
+ wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
@@ -9674,6 +9765,62 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
return cache;
}
+
+/*
+ * Iterate all chunks and verify that each of them has the corresponding block
+ * group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_cache *bg;
+ u64 start = 0;
+ int ret = 0;
+
+ while (1) {
+ read_lock(&map_tree->map_tree.lock);
+ /*
+ * lookup_extent_mapping will return the first extent map
+ * intersecting the range, so setting @len to 1 is enough to
+ * get the first chunk.
+ */
+ em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
+ read_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ break;
+
+ bg = btrfs_lookup_block_group(fs_info, em->start);
+ if (!bg) {
+ btrfs_err(fs_info,
+ "chunk start=%llu len=%llu doesn't have corresponding block group",
+ em->start, em->len);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ break;
+ }
+ if (bg->key.objectid != em->start ||
+ bg->key.offset != em->len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
+ em->start, em->len,
+ em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ bg->key.objectid, bg->key.offset,
+ bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ start = em->start + em->len;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ }
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -9686,6 +9833,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
struct extent_buffer *leaf;
int need_clear = 0;
u64 cache_gen;
+ u64 feature;
+ int mixed;
+
+ feature = btrfs_super_incompat_flags(info->super_copy);
+ mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
root = info->extent_root;
key.objectid = 0;
@@ -9739,6 +9891,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
cache->flags = btrfs_block_group_flags(&cache->item);
+ if (!mixed &&
+ ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(info,
+"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
+ cache->key.objectid);
+ ret = -EINVAL;
+ goto error;
+ }
key.objectid = found_key.objectid + found_key.offset;
btrfs_release_path(path);
@@ -9846,7 +10007,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
}
init_global_block_rsv(info);
- ret = 0;
+ ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
return ret;
@@ -9855,7 +10016,7 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group_cache *block_group;
struct btrfs_root *extent_root = root->fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
@@ -9863,7 +10024,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
trans->can_flush_pending_bgs = false;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
if (ret)
goto next;
@@ -10328,7 +10492,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
num_items = 3 + map->num_stripes;
free_extent_map(em);
@@ -10374,7 +10538,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
- if (block_group->reserved ||
+ if (block_group->reserved || block_group->pinned ||
btrfs_block_group_used(&block_group->item) ||
block_group->ro ||
list_is_singular(&block_group->list)) {
@@ -10573,6 +10737,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
+ /* Discard not supported = nothing to do. */
+ if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ return 0;
+
/* Not writeable = nothing to do. */
if (!device->writeable)
return 0;
@@ -10644,17 +10812,9 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
while (cache) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
@@ -10695,8 +10855,8 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
}
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- devices = &root->fs_info->fs_devices->alloc_list;
- list_for_each_entry(device, devices, dev_alloc_list) {
+ devices = &root->fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
if (ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 322a4046a23a..315f21191643 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2534,7 +2534,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
- ret = ret < 0 ? ret : -EIO;
+ ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
return 0;
@@ -3106,11 +3106,11 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->orig_start)
+ *prev_em_start != em->start)
force_bio_submit = true;
if (prev_em_start)
- *prev_em_start = em->orig_start;
+ *prev_em_start = em->start;
free_extent_map(em);
em = NULL;
@@ -3847,8 +3847,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct block_device *bdev = fs_info->fs_devices->latest_bdev;
struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
+ u32 nritems;
unsigned long i, num_pages;
unsigned long bio_flags = 0;
+ unsigned long start, end;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
int ret = 0;
@@ -3858,6 +3860,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
bio_flags = EXTENT_BIO_TREE_LOG;
+ /* set btree blocks beyond nritems with 0 to avoid stale content. */
+ nritems = btrfs_header_nritems(eb);
+ if (btrfs_header_level(eb) > 0) {
+ end = btrfs_node_key_ptr_offset(nritems);
+
+ memset_extent_buffer(eb, 0, end, eb->len - end);
+ } else {
+ /*
+ * leaf:
+ * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
+ */
+ start = btrfs_item_nr_offset(nritems);
+ end = btrfs_leaf_data(eb) +
+ leaf_data_end(fs_info->tree_root, eb);
+ memset_extent_buffer(eb, 0, start, end - start);
+ }
+
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -5283,11 +5302,20 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
lock_page(page);
}
locked_pages++;
+ }
+ /*
+ * We need to firstly lock all pages to make sure that
+ * the uptodate bit of our pages won't be affected by
+ * clear_extent_buffer_uptodate().
+ */
+ for (i = start_i; i < num_pages; i++) {
+ page = eb->pages[i];
if (!PageUptodate(page)) {
num_reads++;
all_uptodate = 0;
}
}
+
if (all_uptodate) {
if (start_i == 0)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -5342,9 +5370,8 @@ unlock_exit:
return ret;
}
-void read_extent_buffer(struct extent_buffer *eb, void *dstv,
- unsigned long start,
- unsigned long len)
+void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5373,9 +5400,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
- unsigned long start,
- unsigned long len)
+int read_extent_buffer_to_user(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5410,10 +5437,10 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
return ret;
}
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len, char **map,
- unsigned long *map_start,
- unsigned long *map_len)
+int map_private_extent_buffer(const struct extent_buffer *eb,
+ unsigned long start, unsigned long min_len,
+ char **map, unsigned long *map_start,
+ unsigned long *map_len)
{
size_t offset = start & (PAGE_CACHE_SIZE - 1);
char *kaddr;
@@ -5448,9 +5475,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
return 0;
}
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
- unsigned long start,
- unsigned long len)
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f4c1ae11855f..751435967724 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -308,14 +308,13 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
atomic_inc(&eb->refs);
}
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
- unsigned long start,
- unsigned long len);
-void read_extent_buffer(struct extent_buffer *eb, void *dst,
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+ unsigned long start, unsigned long len);
+void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
- unsigned long start,
+int read_extent_buffer_to_user(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
@@ -334,10 +333,10 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
- unsigned long min_len, char **map,
- unsigned long *map_start,
- unsigned long *map_len);
+int map_private_extent_buffer(const struct extent_buffer *eb,
+ unsigned long offset, unsigned long min_len,
+ char **map, unsigned long *map_start,
+ unsigned long *map_len);
int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6a98bddd8f33..84fb56d5c018 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->bdev);
+ kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b2991fd8583e..eb8b8fae036b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -32,7 +32,15 @@ struct extent_map {
u64 block_len;
u64 generation;
unsigned long flags;
- struct block_device *bdev;
+ union {
+ struct block_device *bdev;
+
+ /*
+ * used for chunk mappings
+ * flags & EXTENT_FLAG_FS_MAPPING must be set
+ */
+ struct map_lookup *map_lookup;
+ };
atomic_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 353f4bae658c..052973620595 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1861,10 +1861,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
int ret;
+ struct blk_plug plug;
+ /*
+ * This is only called in fsync, which would do synchronous writes, so
+ * a plug can merge adjacent IOs as much as possible. Esp. in case of
+ * multiple disks using raid profile, a large IO can be split to
+ * several segments of stripe length (currently 64K).
+ */
+ blk_start_plug(&plug);
atomic_inc(&BTRFS_I(inode)->sync_writers);
ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
+ blk_finish_plug(&plug);
return ret;
}
@@ -2771,7 +2780,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (!ret)
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
- range->len, 1 << inode->i_blkbits,
+ range->len, i_blocksize(inode),
offset + len, &alloc_hint);
list_del(&range->list);
kfree(range);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cfe99bec49de..6c0161284a9e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1258,7 +1258,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/* Lock all pages first so we can lock the extent safely. */
ret = io_ctl_prepare_pages(io_ctl, inode, 0);
if (ret)
- goto out;
+ goto out_unlock;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);
@@ -1351,6 +1351,7 @@ out_nospc_locked:
out_nospc:
cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
+out_unlock:
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -1698,6 +1699,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1781,6 +1784,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1802,8 +1812,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1821,8 +1831,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1834,8 +1844,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2457,6 +2469,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2466,6 +2479,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(block_group->fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(block_group->fs_info,
@@ -2693,8 +2707,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2731,8 +2745,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index aae520b2aee5..6fc94fcba072 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -33,6 +33,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
{
SHASH_DESC_ON_STACK(shash, tfm);
u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 retval;
int err;
shash->tfm = tfm;
@@ -42,5 +43,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ retval = *ctx;
+ barrier_data(ctx);
+ return retval;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4bc9dbf29a73..383717ccecc7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -481,6 +481,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -1202,6 +1203,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
list_del(&sums->list);
kfree(sums);
}
+ if (ret < 0)
+ return ret;
return 1;
}
@@ -1292,8 +1295,11 @@ next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0)
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
goto error;
+ }
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -1348,10 +1354,23 @@ next_slot:
goto out_check;
if (btrfs_extent_readonly(root, disk_bytenr))
goto out_check;
- if (btrfs_cross_ref_exist(trans, root, ino,
+ ret = btrfs_cross_ref_exist(trans, root, ino,
found_key.offset -
- extent_offset, disk_bytenr))
+ extent_offset, disk_bytenr);
+ if (ret) {
+ /*
+ * ret could be -EIO if the above fails to read
+ * metadata.
+ */
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+
+ WARN_ON_ONCE(nolock);
goto out_check;
+ }
disk_bytenr += extent_offset;
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
@@ -1369,8 +1388,20 @@ next_slot:
* this ensure that csum for a given extent are
* either valid or do not exist.
*/
- if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ ret = csum_exist_in_range(root, disk_bytenr, num_bytes);
+ if (ret) {
+ /*
+ * ret could be -EIO if the above fails to read
+ * metadata.
+ */
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+ WARN_ON_ONCE(nolock);
goto out_check;
+ }
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
@@ -2015,7 +2046,15 @@ again:
goto out;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ &cached_state);
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ ClearPageChecked(page);
+ goto out;
+ }
+
ClearPageChecked(page);
set_page_dirty(page);
out:
@@ -4397,8 +4436,19 @@ search_again:
if (found_type > min_type) {
del_item = 1;
} else {
- if (item_end < new_size)
+ if (item_end < new_size) {
+ /*
+ * With NO_HOLES mode, for the following mapping
+ *
+ * [0-4k][hole][8k-12k]
+ *
+ * if truncating isize down to 6k, it ends up
+ * isize being 8k.
+ */
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ last_size = new_size;
break;
+ }
if (found_key.offset >= new_size)
del_item = 1;
else
@@ -6391,8 +6441,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock_inode;
} else {
btrfs_update_inode(trans, root, inode);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
}
out_unlock:
@@ -6467,8 +6516,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
goto out_unlock_inode;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans, root);
@@ -6611,12 +6659,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (err)
goto out_fail_inode;
- d_instantiate(dentry, inode);
- /*
- * mkdir is special. We're unlocking after we call d_instantiate
- * to avoid a race with nfsd calling d_instantiate.
- */
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
drop_on_err = 0;
out_fail:
@@ -6724,6 +6767,20 @@ static noinline int uncompress_inline(struct btrfs_path *path,
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
+
+ /*
+ * decompression code contains a memset to fill in any space between the end
+ * of the uncompressed data and the end of max_size in case the decompressed
+ * data ends up shorter than ram_bytes. That doesn't cover the hole between
+ * the end of an inline extent and the beginning of the next block, so we
+ * cover that region here.
+ */
+
+ if (max_size + pg_offset < PAGE_SIZE) {
+ char *map = kmap(page);
+ memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
+ kunmap(page);
+ }
kfree(tmp);
return ret;
}
@@ -7318,8 +7375,8 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
int found = false;
void **pagep = NULL;
struct page *page = NULL;
- int start_idx;
- int end_idx;
+ unsigned long start_idx;
+ unsigned long end_idx;
start_idx = start >> PAGE_CACHE_SHIFT;
@@ -7510,11 +7567,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode,
* within our reservation, otherwise we need to adjust our inode
* counter appropriately.
*/
- if (dio_data->outstanding_extents) {
+ if (dio_data->outstanding_extents >= num_extents) {
dio_data->outstanding_extents -= num_extents;
} else {
+ /*
+ * If dio write length has been split due to no large enough
+ * contiguous space, we need to compensate our inode counter
+ * appropriately.
+ */
+ u64 num_needed = num_extents - dio_data->outstanding_extents;
+
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents += num_extents;
+ BTRFS_I(inode)->outstanding_extents += num_needed;
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -8691,9 +8755,14 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
* So even we call qgroup_free_data(), it won't decrease reserved
* space.
* 2) Not written to disk
- * This means the reserved space should be freed here.
+ * This means the reserved space should be freed here. However,
+ * if a truncate invalidates the page (by clearing PageDirty)
+ * and the page is accounted for while allocating extent
+ * in btrfs_check_data_free_space() we let delayed_ref to
+ * free the entire extent.
*/
- btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
+ if (PageDirty(page))
+ btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -9741,8 +9810,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock_inode;
}
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a7e18dbadf74..3379490ce54d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,7 +378,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -402,11 +401,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info->tree_root, &range);
if (ret < 0)
@@ -2231,7 +2234,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
if (!path)
return -ENOMEM;
- ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
+ ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
key.objectid = tree_id;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -2984,7 +2987,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
out:
if (ret)
btrfs_cmp_data_free(cmp);
- return 0;
+ return ret;
}
static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
@@ -3825,6 +3828,11 @@ process_slot:
}
btrfs_release_path(path);
key.offset = next_key_min_offset;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
}
ret = 0;
@@ -3918,11 +3926,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (!(src_file.file->f_mode & FMODE_READ))
goto out_fput;
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- goto out_fput;
-
ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
goto out_fput;
@@ -3937,15 +3940,30 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
mutex_lock(&src->i_mutex);
}
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* determine range to clone */
ret = -EINVAL;
if (off + len > src->i_size || off + len < off)
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
- /* if we extend to eof, continue to block boundary */
- if (off + len == src->i_size)
+ /*
+ * If we extend to eof, continue to block boundary if and only if the
+ * destination end offset matches the destination file's size, otherwise
+ * we would be corrupting data by placing the eof block into the middle
+ * of a file.
+ */
+ if (off + len == src->i_size) {
+ if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
+ goto out_unlock;
len = ALIGN(src->i_size, bs) - off;
+ }
if (len == 0) {
ret = 0;
@@ -4113,6 +4131,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
+ if (!is_fstree(new_root->objectid)) {
+ ret = -ENOENT;
+ goto out;
+ }
path = btrfs_alloc_path();
if (!path) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bcc965ed5fa1..90e29d40aa82 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2186,6 +2186,21 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
}
/*
+ * Check if the leaf is the last leaf. Which means all node pointers
+ * are at their last position.
+ */
+static bool is_last_leaf(struct btrfs_path *path)
+{
+ int i;
+
+ for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
+ return false;
+ }
+ return true;
+}
+
+/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
*/
@@ -2198,6 +2213,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
+ bool done;
int slot;
int ret;
@@ -2225,6 +2241,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
+ done = is_last_leaf(path);
btrfs_item_key_to_cpu(path->nodes[0], &found,
btrfs_header_nritems(path->nodes[0]) - 1);
@@ -2271,6 +2288,8 @@ out:
}
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ if (done && !ret)
+ ret = 1;
return ret;
}
@@ -2283,10 +2302,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = true;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
-
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -2397,6 +2412,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
+ fs_info->qgroup_rescan_running = true;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2430,6 +2446,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1a33d3eb36de..2d2a76906786 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2160,11 +2160,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
}
/*
- * reconstruct from the q stripe if they are
- * asking for mirror 3
+ * Loop retry:
+ * for 'mirror == 2', reconstruct from all other stripes.
+ * for 'mirror_num > 2', select a stripe to fail on every retry.
*/
- if (mirror_num == 3)
- rbio->failb = rbio->real_stripes - 2;
+ if (mirror_num > 2) {
+ /*
+ * 'mirror == 3' is to fail the p stripe and
+ * reconstruct from the q stripe. 'mirror > 3' is to
+ * fail a data stripe and reconstruct from p+q stripe.
+ */
+ rbio->failb = rbio->real_stripes - (mirror_num - 1);
+ ASSERT(rbio->failb > 0);
+ if (rbio->failb <= rbio->faila)
+ rbio->failb--;
+ }
ret = lock_stripe_add(rbio);
@@ -2410,8 +2420,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ kunmap(p_page);
}
__free_page(p_page);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b4ca5454ef1a..d6ccfb31aef0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -921,9 +921,16 @@ again:
path2->slots[level]--;
eb = path2->nodes[level];
- WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
- cur->bytenr);
-
+ if (btrfs_node_blockptr(eb, path2->slots[level]) !=
+ cur->bytenr) {
+ btrfs_err(root->fs_info,
+ "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
+ cur->bytenr, level - 1, root->objectid,
+ node_key->objectid, node_key->type,
+ node_key->offset);
+ err = -ENOENT;
+ goto out;
+ }
lower = cur;
need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
@@ -1311,18 +1318,19 @@ static void __del_reloc_root(struct btrfs_root *root)
struct mapping_node *node = NULL;
struct reloc_control *rc = root->fs_info->reloc_ctl;
- spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
- if (rb_node) {
- node = rb_entry(rb_node, struct mapping_node, rb_node);
- rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ if (rc && root->node) {
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->node->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (!node)
+ return;
+ BUG_ON((struct btrfs_root *)node->data != root);
}
- spin_unlock(&rc->reloc_root_tree.lock);
-
- if (!node)
- return;
- BUG_ON((struct btrfs_root *)node->data != root);
spin_lock(&root->fs_info->trans_lock);
list_del_init(&root->root_list);
@@ -2344,6 +2352,10 @@ void free_reloc_roots(struct list_head *list)
reloc_root = list_entry(list->next, struct btrfs_root,
root_list);
__del_reloc_root(reloc_root);
+ free_extent_buffer(reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ reloc_root->node = NULL;
+ reloc_root->commit_root = NULL;
}
}
@@ -2676,11 +2688,15 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0) {
- err = ret;
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ else
+ err = -ENOENT;
+
+ btrfs_release_path(path);
break;
}
- BUG_ON(ret > 0);
if (!upper->eb) {
upper->eb = path->nodes[upper->level];
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2c849b08a91b..6a6efb26d52f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ root = btrfs_lookup_fs_root(tree_root->fs_info,
+ root_key.objectid);
+ if (root) {
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state));
+ if (btrfs_root_refs(&root->root_item) == 0)
+ btrfs_add_dead_root(root);
+ continue;
+ }
+
root = btrfs_read_fs_root(tree_root, &root_key);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
@@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
- /*
- * The root might have been inserted already, as before we look
- * for orphan roots, log replay might have happened, which
- * triggers a transaction commit and qgroup accounting, which
- * in turn reads and inserts fs roots while doing backref
- * walking.
- */
- if (err == -EEXIST)
- err = 0;
if (err) {
+ BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b091d94ceef6..cc9ccc42f469 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2513,7 +2513,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
- if (sctx->is_dev_replace && !have_csum) {
+ if (0 && sctx->is_dev_replace && !have_csum) {
ret = copy_nocow_pages(sctx, logical, l,
mirror_num,
physical_for_dev_replace);
@@ -3460,7 +3460,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (em->start != chunk_offset)
goto out;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 63a6152be04b..40d1ab957fb6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1648,6 +1648,9 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
{
int ret;
+ if (ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+
ret = get_cur_inode_state(sctx, ino, gen);
if (ret < 0)
goto out;
@@ -1833,7 +1836,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* not delted and then re-created, if it was then we have no overwrite
* and we can just unlink this entry.
*/
- if (sctx->parent_root) {
+ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
@@ -3229,7 +3232,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
kfree(m);
}
-static void tail_append_pending_moves(struct pending_dir_move *moves,
+static void tail_append_pending_moves(struct send_ctx *sctx,
+ struct pending_dir_move *moves,
struct list_head *stack)
{
if (list_empty(&moves->list)) {
@@ -3240,6 +3244,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves,
list_add_tail(&moves->list, stack);
list_splice_tail(&list, stack);
}
+ if (!RB_EMPTY_NODE(&moves->node)) {
+ rb_erase(&moves->node, &sctx->pending_dir_moves);
+ RB_CLEAR_NODE(&moves->node);
+ }
}
static int apply_children_dir_moves(struct send_ctx *sctx)
@@ -3254,7 +3262,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
return 0;
INIT_LIST_HEAD(&stack);
- tail_append_pending_moves(pm, &stack);
+ tail_append_pending_moves(sctx, pm, &stack);
while (!list_empty(&stack)) {
pm = list_first_entry(&stack, struct pending_dir_move, list);
@@ -3265,7 +3273,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
goto out;
pm = get_pending_dir_moves(sctx, parent_ino);
if (pm)
- tail_append_pending_moves(pm, &stack);
+ tail_append_pending_moves(sctx, pm, &stack);
}
return 0;
@@ -4671,6 +4679,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
u64 len;
int ret = 0;
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, end - offset);
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5005,13 +5016,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
while (key.offset < ekey->offset + left_len) {
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
- if (right_type != BTRFS_FILE_EXTENT_REG) {
+ if (right_type != BTRFS_FILE_EXTENT_REG &&
+ right_type != BTRFS_FILE_EXTENT_INLINE) {
ret = 0;
goto out;
}
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
- right_len = btrfs_file_extent_num_bytes(eb, ei);
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ right_len = btrfs_file_extent_inline_len(eb, slot, ei);
+ right_len = PAGE_ALIGN(right_len);
+ } else {
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ }
right_offset = btrfs_file_extent_offset(eb, ei);
right_gen = btrfs_file_extent_generation(eb, ei);
@@ -5025,6 +5042,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
goto out;
}
+ /*
+ * We just wanted to see if when we have an inline extent, what
+ * follows it is a regular extent (wanted to check the above
+ * condition for inline extents too). This should normally not
+ * happen but it's possible for example when we have an inline
+ * compressed extent representing data with a size matching
+ * the page size (currently the same as sector size).
+ */
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = 0;
+ goto out;
+ }
+
left_offset_fixed = left_offset;
if (key.offset < ekey->offset) {
/* Fix the right offset for 2a and 7. */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index b976597b0721..63ffd213b0b7 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,8 +50,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
- unsigned long off, \
+u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off, \
struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)ptr; \
@@ -90,7 +90,8 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
return res; \
} \
void btrfs_set_token_##bits(struct extent_buffer *eb, \
- void *ptr, unsigned long off, u##bits val, \
+ const void *ptr, unsigned long off, \
+ u##bits val, \
struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)ptr; \
@@ -133,7 +134,7 @@ DEFINE_BTRFS_SETGET_BITS(16)
DEFINE_BTRFS_SETGET_BITS(32)
DEFINE_BTRFS_SETGET_BITS(64)
-void btrfs_node_key(struct extent_buffer *eb,
+void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr = btrfs_node_key_ptr_offset(nr);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5d34a062ca4f..0f99336c37eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1727,6 +1727,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
+ btrfs_qgroup_rescan_resume(fs_info);
+
if (!fs_info->uuid_root) {
btrfs_info(fs_info, "creating UUID tree");
ret = btrfs_create_uuid_tree(fs_info);
@@ -2102,6 +2104,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
+ vol->name[BTRFS_PATH_NAME_MAX] = '\0';
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 846d277b1901..2b2978c04e80 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -70,7 +70,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
btrfs_set_extent_generation(leaf, item, 1);
btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
block_info = (struct btrfs_tree_block_info *)(item + 1);
- btrfs_set_tree_block_level(leaf, block_info, 1);
+ btrfs_set_tree_block_level(leaf, block_info, 0);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
if (parent > 0) {
btrfs_set_extent_inline_ref_type(leaf, iref,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index be8eae80ff65..098016338f98 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1821,6 +1821,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
@@ -1830,9 +1833,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_trans_release_metadata(trans, root);
- trans->block_rsv = NULL;
-
cur_trans = trans->transaction;
/*
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
new file mode 100644
index 000000000000..5b98f3c76ce4
--- /dev/null
+++ b/fs/btrfs/tree-checker.c
@@ -0,0 +1,649 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+/*
+ * The module is used to catch unexpected/corrupted tree block data.
+ * Such behavior can be caused either by a fuzzed image or bugs.
+ *
+ * The objective is to do leaf/node validation checks when tree block is read
+ * from disk, and check *every* possible member, so other code won't
+ * need to checking them again.
+ *
+ * Due to the potential and unwanted damage, every checker needs to be
+ * carefully reviewed otherwise so it does not prevent mount of valid images.
+ */
+
+#include "ctree.h"
+#include "tree-checker.h"
+#include "disk-io.h"
+#include "compression.h"
+#include "hash.h"
+#include "volumes.h"
+
+#define CORRUPT(reason, eb, root, slot) \
+ btrfs_crit(root->fs_info, \
+ "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", \
+ reason, btrfs_header_bytenr(eb), root->objectid, slot)
+
+/*
+ * Error message should follow the following format:
+ * corrupt <type>: <identifier>, <reason>[, <bad_value>]
+ *
+ * @type: leaf or node
+ * @identifier: the necessary info to locate the leaf/node.
+ * It's recommened to decode key.objecitd/offset if it's
+ * meaningful.
+ * @reason: describe the error
+ * @bad_value: optional, it's recommened to output bad value and its
+ * expected value (range).
+ *
+ * Since comma is used to separate the components, only space is allowed
+ * inside each component.
+ */
+
+/*
+ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt.
+ * Allows callers to customize the output.
+ */
+__printf(4, 5)
+static void generic_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
+ va_end(args);
+}
+
+static int check_extent_data_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_file_extent_item *fi;
+ u32 sectorsize = root->sectorsize;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ CORRUPT("unaligned key offset for file extent",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+ CORRUPT("invalid file extent type", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Support for new compression/encrption must introduce incompat flag,
+ * and must be caught in open_ctree().
+ */
+ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+ CORRUPT("invalid file extent compression", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_encryption(leaf, fi)) {
+ CORRUPT("invalid file extent encryption", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ /* Inline extent must have 0 as key offset */
+ if (key->offset) {
+ CORRUPT("inline extent has non-zero key offset",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /* Compressed inline extent has no on-disk size, skip it */
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE)
+ return 0;
+
+ /* Uncompressed inline extent size must match item size */
+ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi)) {
+ CORRUPT("plaintext inline extent has invalid size",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /* Regular or preallocated extent has fixed item size */
+ if (item_size != sizeof(*fi)) {
+ CORRUPT(
+ "regluar or preallocated extent data item size is invalid",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) ||
+ !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) {
+ CORRUPT(
+ "regular or preallocated extent data item has unaligned value",
+ leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ u32 sectorsize = root->sectorsize;
+ u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ CORRUPT("invalid objectid for csum item", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ CORRUPT("unaligned key offset for csum item", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ CORRUPT("unaligned csum item size", leaf, root, slot);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+/*
+ * Customized reported for dir_item, only important new info is key->objectid,
+ * which represents inode number
+ */
+__printf(4, 5)
+static void dir_item_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
+ btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+ va_end(args);
+}
+
+static int check_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dir_item *di;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 cur = 0;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ while (cur < item_size) {
+ u32 name_len;
+ u32 data_len;
+ u32 max_name_len;
+ u32 total_size;
+ u32 name_hash;
+ u8 dir_type;
+
+ /* header itself should not cross item boundary */
+ if (cur + sizeof(*di) > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item header crosses item boundary, have %zu boundary %u",
+ cur + sizeof(*di), item_size);
+ return -EUCLEAN;
+ }
+
+ /* dir type check */
+ dir_type = btrfs_dir_type(leaf, di);
+ if (dir_type >= BTRFS_FT_MAX) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type, have %u expect [0, %u)",
+ dir_type, BTRFS_FT_MAX);
+ return -EUCLEAN;
+ }
+
+ if (key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type for XATTR key, have %u expect %u",
+ dir_type, BTRFS_FT_XATTR);
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY) {
+ dir_item_err(root, leaf, slot,
+ "xattr dir type found for non-XATTR key");
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR)
+ max_name_len = XATTR_NAME_MAX;
+ else
+ max_name_len = BTRFS_NAME_LEN;
+
+ /* Name/data length check */
+ name_len = btrfs_dir_name_len(leaf, di);
+ data_len = btrfs_dir_data_len(leaf, di);
+ if (name_len > max_name_len) {
+ dir_item_err(root, leaf, slot,
+ "dir item name len too long, have %u max %u",
+ name_len, max_name_len);
+ return -EUCLEAN;
+ }
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
+ dir_item_err(root, leaf, slot,
+ "dir item name and data len too long, have %u max %zu",
+ name_len + data_len,
+ BTRFS_MAX_XATTR_SIZE(root));
+ return -EUCLEAN;
+ }
+
+ if (data_len && dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "dir item with invalid data len, have %u expect 0",
+ data_len);
+ return -EUCLEAN;
+ }
+
+ total_size = sizeof(*di) + name_len + data_len;
+
+ /* header and name/data should not cross item boundary */
+ if (cur + total_size > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item data crosses item boundary, have %u boundary %u",
+ cur + total_size, item_size);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Special check for XATTR/DIR_ITEM, as key->offset is name
+ * hash, should match its name
+ */
+ if (key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_XATTR_ITEM_KEY) {
+ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+
+ read_extent_buffer(leaf, namebuf,
+ (unsigned long)(di + 1), name_len);
+ name_hash = btrfs_name_hash(namebuf, name_len);
+ if (key->offset != name_hash) {
+ dir_item_err(root, leaf, slot,
+ "name hash mismatch with key, have 0x%016x expect 0x%016llx",
+ name_hash, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ cur += total_size;
+ di = (struct btrfs_dir_item *)((void *)di + total_size);
+ }
+ return 0;
+}
+
+__printf(4, 5)
+__cold
+static void block_group_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+static int check_block_group_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_block_group_item bgi;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u64 flags;
+ u64 type;
+
+ /*
+ * Here we don't really care about alignment since extent allocator can
+ * handle it. We care more about the size, as if one block group is
+ * larger than maximum size, it's must be some obvious corruption.
+ */
+ if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid block group size, have %llu expect (0, %llu]",
+ key->offset, BTRFS_MAX_DATA_CHUNK_SIZE);
+ return -EUCLEAN;
+ }
+
+ if (item_size != sizeof(bgi)) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid item size, have %u expect %zu",
+ item_size, sizeof(bgi));
+ return -EUCLEAN;
+ }
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+ if (btrfs_block_group_chunk_objectid(&bgi) !=
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid block group chunk objectid, have %llu expect %llu",
+ btrfs_block_group_chunk_objectid(&bgi),
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ if (btrfs_block_group_used(&bgi) > key->offset) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid block group used, have %llu expect [0, %llu)",
+ btrfs_block_group_used(&bgi), key->offset);
+ return -EUCLEAN;
+ }
+
+ flags = btrfs_block_group_flags(&bgi);
+ if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
+ block_group_err(fs_info, leaf, slot,
+"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+ flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
+ hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
+ return -EUCLEAN;
+ }
+
+ type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ if (type != BTRFS_BLOCK_GROUP_DATA &&
+ type != BTRFS_BLOCK_GROUP_METADATA &&
+ type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA)) {
+ block_group_err(fs_info, leaf, slot,
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+ type, hweight64(type),
+ BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+/*
+ * Common point to switch the item-specific validation.
+ */
+static int check_leaf_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ int ret = 0;
+
+ switch (key->type) {
+ case BTRFS_EXTENT_DATA_KEY:
+ ret = check_extent_data_item(root, leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ ret = check_csum_item(root, leaf, key, slot);
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ ret = check_dir_item(root, leaf, key, slot);
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ ret = check_block_group_item(root->fs_info, leaf, key, slot);
+ break;
+ }
+ return ret;
+}
+
+static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
+ bool check_item_data)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ /* No valid key type is 0, so all key should be larger than this key */
+ struct btrfs_key prev_key = {0, 0, 0};
+ struct btrfs_key key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (btrfs_header_level(leaf) != 0) {
+ generic_err(root, leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Extent buffers from a relocation tree have a owner field that
+ * corresponds to the subvolume tree they are based on. So just from an
+ * extent buffer alone we can not find out what is the id of the
+ * corresponding subvolume tree, so we can not figure out if the extent
+ * buffer corresponds to the root of the relocation tree or not. So
+ * skip this check for relocation trees.
+ */
+ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ u64 owner = btrfs_header_owner(leaf);
+ struct btrfs_root *check_root;
+
+ /* These trees must never be empty */
+ if (owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_EXTENT_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ generic_err(root, leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+ key.objectid = owner;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ check_root = btrfs_get_fs_root(fs_info, &key, false);
+ /*
+ * The only reason we also check NULL here is that during
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
+ struct extent_buffer *eb;
+
+ eb = btrfs_root_node(check_root);
+ /* if leaf is the root, then it's fine */
+ if (leaf != eb) {
+ CORRUPT("non-root leaf's nritems is 0",
+ leaf, check_root, 0);
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ free_extent_buffer(eb);
+ }
+ return 0;
+ }
+
+ if (nritems == 0)
+ return 0;
+
+ /*
+ * Check the following things to make sure this is a good leaf, and
+ * leaf users won't need to bother with similar sanity checks:
+ *
+ * 1) key ordering
+ * 2) item offset and size
+ * No overlap, no hole, all inside the leaf.
+ * 3) item content
+ * If possible, do comprehensive sanity check.
+ * NOTE: All checks must only rely on the item data itself.
+ */
+ for (slot = 0; slot < nritems; slot++) {
+ u32 item_end_expected;
+ int ret;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (slot == 0)
+ item_end_expected = BTRFS_LEAF_DATA_SIZE(root);
+ else
+ item_end_expected = btrfs_item_offset_nr(leaf,
+ slot - 1);
+ if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent to each other, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ /* Also check if the item pointer overlaps with btrfs item. */
+ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
+ btrfs_item_ptr_offset(leaf, slot)) {
+ CORRUPT("slot overlap with its data", leaf, root, slot);
+ return -EUCLEAN;
+ }
+
+ if (check_item_data) {
+ /*
+ * Check if the item size and content meet other
+ * criteria
+ */
+ ret = check_leaf_item(root, leaf, &key, slot);
+ if (ret < 0)
+ return ret;
+ }
+
+ prev_key.objectid = key.objectid;
+ prev_key.type = key.type;
+ prev_key.offset = key.offset;
+ }
+
+ return 0;
+}
+
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf)
+{
+ return check_leaf(root, leaf, true);
+}
+
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ return check_leaf(root, leaf, false);
+}
+
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+ unsigned long nr = btrfs_header_nritems(node);
+ struct btrfs_key key, next_key;
+ int slot;
+ int level = btrfs_header_level(node);
+ u64 bytenr;
+ int ret = 0;
+
+ if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ generic_err(root, node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+ btrfs_crit(root->fs_info,
+"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%zu]",
+ root->objectid, node->start,
+ nr == 0 ? "small" : "large", nr,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
+ return -EUCLEAN;
+ }
+
+ for (slot = 0; slot < nr - 1; slot++) {
+ bytenr = btrfs_node_blockptr(node, slot);
+ btrfs_node_key_to_cpu(node, &key, slot);
+ btrfs_node_key_to_cpu(node, &next_key, slot + 1);
+
+ if (!bytenr) {
+ generic_err(root, node, slot,
+ "invalid NULL node pointer");
+ ret = -EUCLEAN;
+ goto out;
+ }
+ if (!IS_ALIGNED(bytenr, root->sectorsize)) {
+ generic_err(root, node, slot,
+ "unaligned pointer, have %llu should be aligned to %u",
+ bytenr, root->sectorsize);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+ generic_err(root, node, slot,
+ "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
+ key.objectid, key.type, key.offset,
+ next_key.objectid, next_key.type,
+ next_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000000..3d53e8d6fda0
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+#ifndef __BTRFS_TREE_CHECKER__
+#define __BTRFS_TREE_CHECKER__
+
+#include "ctree.h"
+#include "extent_io.h"
+
+/*
+ * Comprehensive leaf checker.
+ * Will check not only the item pointers, but also every possible member
+ * in item data.
+ */
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf);
+
+/*
+ * Less strict leaf checker.
+ * Will only check item pointers, not reading item data.
+ */
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+ struct extent_buffer *leaf);
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
+
+#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1415f6d58633..c7190f322576 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
+#include "inode-map.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -1923,12 +1924,11 @@ static noinline int find_dir_range(struct btrfs_root *root,
next:
/* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret)
goto out;
- } else {
- path->slots[0]++;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -2223,8 +2223,10 @@ again:
nritems = btrfs_header_nritems(path->nodes[0]);
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
- if (ret)
+ if (ret == 1)
break;
+ else if (ret < 0)
+ goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
@@ -2446,6 +2448,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ } else {
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+ clear_extent_buffer_dirty(next);
}
WARN_ON(root_owner !=
@@ -2525,6 +2530,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ } else {
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+ clear_extent_buffer_dirty(next);
}
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
@@ -2601,6 +2609,9 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
clean_tree_block(trans, log->fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ } else {
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+ clear_extent_buffer_dirty(next);
}
WARN_ON(log->root_key.objectid !=
@@ -2696,14 +2707,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
int index, int error)
{
struct btrfs_log_ctx *ctx;
+ struct btrfs_log_ctx *safe;
- if (!error) {
- INIT_LIST_HEAD(&root->log_ctxs[index]);
- return;
- }
-
- list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
+ list_del_init(&ctx->list);
ctx->log_ret = error;
+ }
INIT_LIST_HEAD(&root->log_ctxs[index]);
}
@@ -2944,34 +2953,34 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
- /*
- * We needn't get log_mutex here because we are sure all
- * the other tasks are blocked.
- */
+ mutex_lock(&log_root_tree->log_mutex);
btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
- mutex_lock(&log_root_tree->log_mutex);
log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active is needed so all the updates
+ * above are seen by the woken threads. It might not be necessary, but
+ * proving that seems to be hard.
*/
+ smp_mb();
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
- /* See above. */
- btrfs_remove_all_log_ctxs(root, index1, ret);
-
mutex_lock(&root->log_mutex);
+ btrfs_remove_all_log_ctxs(root, index1, ret);
root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active is needed so all the updates
+ * above are seen by the woken threads. It might not be necessary, but
+ * proving that seems to be hard.
*/
+ smp_mb();
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -3312,9 +3321,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again. See the note for
+ * log_new_dir_dentries, if we're logging a directory recursively we
+ * won't be holding its i_mutex, which means we can modify the directory
+ * while we're logging it. If we remove an entry between our first
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (WARN_ON(ret != 0))
+ if (ret != 0)
goto done;
/*
@@ -3377,8 +3393,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* from this directory and from this transaction
*/
ret = btrfs_next_leaf(root, path);
- if (ret == 1) {
- last_offset = (u64)-1;
+ if (ret) {
+ if (ret == 1)
+ last_offset = (u64)-1;
+ else
+ err = ret;
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -3829,6 +3848,7 @@ fill_holes:
ASSERT(ret == 0);
src = src_path->nodes[0];
i = 0;
+ need_find_last_extent = true;
}
btrfs_item_key_to_cpu(src, &key, i);
@@ -4567,6 +4587,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 logged_isize = 0;
bool need_log_inode_item = true;
+ bool xattrs_logged = false;
path = btrfs_alloc_path();
if (!path)
@@ -4807,6 +4828,7 @@ next_slot:
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
if (err)
goto out_unlock;
+ xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -4819,6 +4841,11 @@ log_extents:
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode);
+ if (!err && !xattrs_logged) {
+ err = btrfs_log_all_xattrs(trans, root, inode, path,
+ dst_path);
+ btrfs_release_path(path);
+ }
if (err)
goto out_unlock;
}
@@ -5220,9 +5247,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
ret = btrfs_log_inode(trans, root, dir_inode,
LOG_INODE_ALL, 0, LLONG_MAX, ctx);
@@ -5523,6 +5574,23 @@ again:
path);
}
+ if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+ struct btrfs_root *root = wc.replay_dest;
+
+ btrfs_release_path(path);
+
+ /*
+ * We have just replayed everything, and the highest
+ * objectid of fs roots probably has changed in case
+ * some inode_item's got replayed.
+ *
+ * root->objectid_mutex is not acquired as log replay
+ * could only happen during mount.
+ */
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
+ }
+
key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 778282944530..837a9a8d579e 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -348,7 +348,5 @@ skip:
out:
btrfs_free_path(path);
- if (ret)
- btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
- return 0;
+ return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9c62a6f9757a..d1cca19b29d3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
-const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
[BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
[BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
@@ -568,6 +568,7 @@ void btrfs_free_stale_device(struct btrfs_device *cur_dev)
btrfs_sysfs_remove_fsid(fs_devs);
list_del(&fs_devs->list);
free_fs_devices(fs_devs);
+ break;
} else {
fs_devs->num_devices--;
list_del(&dev->dev_list);
@@ -1183,7 +1184,7 @@ again:
struct map_lookup *map;
int i;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
u64 end;
@@ -2756,7 +2757,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
free_extent_map(em);
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
lock_chunks(root->fs_info->chunk_root);
check_system_chunk(trans, extent_root, map->type);
unlock_chunks(root->fs_info->chunk_root);
@@ -3849,6 +3850,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
+ /*
+ * A ro->rw remount sequence should continue with the paused balance
+ * regardless of who pauses it, system or the user as of now, so set
+ * the resume flag.
+ */
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
return PTR_ERR_OR_ZERO(tsk);
}
@@ -4530,7 +4540,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
- max_chunk_size = 10 * max_stripe_size;
+ max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
@@ -4638,10 +4648,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (devs_max && ndevs > devs_max)
ndevs = devs_max;
/*
- * the primary goal is to maximize the number of stripes, so use as many
- * devices as possible, even if the stripes are not maximum sized.
+ * The primary goal is to maximize the number of stripes, so use as
+ * many devices as possible, even if the stripes are not maximum sized.
+ *
+ * The DUP profile stores more than one stripe per device, the
+ * max_avail is the total size so we have to adjust.
*/
- stripe_size = devices_info[ndevs-1].max_avail;
+ stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
num_stripes = ndevs * dev_stripes;
/*
@@ -4681,8 +4694,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = devices_info[ndevs-1].max_avail;
}
- stripe_size = div_u64(stripe_size, dev_stripes);
-
/* align to BTRFS_STRIPE_LEN */
stripe_size = div_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
@@ -4720,7 +4731,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
goto error;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = start;
em->len = num_bytes;
em->block_start = 0;
@@ -4815,7 +4826,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
@@ -4957,7 +4968,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->missing) {
miss_ndevs++;
@@ -5037,7 +5048,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5045,7 +5056,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- ret = 3;
+ /*
+ * There could be two corrupted data stripes, we need
+ * to loop retry in order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the
+ * stripe under reconstruction.
+ */
+ ret = map->num_stripes;
else
ret = 1;
free_extent_map(em);
@@ -5073,7 +5091,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
free_extent_map(em);
@@ -5094,7 +5112,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
@@ -5253,7 +5271,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
offset = logical - em->start;
stripe_len = map->stripe_len;
@@ -5795,7 +5813,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
free_extent_map(em);
return -EIO;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
length = em->len;
rmap_len = map->stripe_len;
@@ -6190,6 +6208,101 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
return dev;
}
+/* Return -EIO if any error, otherwise return 0. */
+static int btrfs_check_chunk_valid(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical)
+{
+ u64 length;
+ u64 stripe_len;
+ u16 num_stripes;
+ u16 sub_stripes;
+ u64 type;
+ u64 features;
+ bool mixed = false;
+
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+
+ if (!num_stripes) {
+ btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+ num_stripes);
+ return -EIO;
+ }
+ if (!IS_ALIGNED(logical, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk logical %llu", logical);
+ return -EIO;
+ }
+ if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
+ btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+ btrfs_chunk_sector_size(leaf, chunk));
+ return -EIO;
+ }
+ if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk length %llu", length);
+ return -EIO;
+ }
+ if (!is_power_of_2(stripe_len)) {
+ btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+ stripe_len);
+ return -EIO;
+ }
+ if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ type) {
+ btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+ ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk));
+ return -EIO;
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+ btrfs_err(root->fs_info, "missing chunk type flag: 0x%llx", type);
+ return -EIO;
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(root->fs_info,
+ "system chunk with data or metadata type: 0x%llx", type);
+ return -EIO;
+ }
+
+ features = btrfs_super_incompat_flags(root->fs_info->super_copy);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = true;
+
+ if (!mixed) {
+ if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA)) {
+ btrfs_err(root->fs_info,
+ "mixed chunk type in non-mixed mode: 0x%llx", type);
+ return -EIO;
+ }
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != 1)) {
+ btrfs_err(root->fs_info,
+ "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+ num_stripes, sub_stripes,
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EIO;
+ }
+
+ return 0;
+}
+
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
@@ -6199,6 +6312,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_map *em;
u64 logical;
u64 length;
+ u64 stripe_len;
u64 devid;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
@@ -6207,6 +6321,12 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ ret = btrfs_check_chunk_valid(root, leaf, chunk, logical);
+ if (ret)
+ return ret;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6223,7 +6343,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
em = alloc_extent_map();
if (!em)
return -ENOMEM;
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
@@ -6231,7 +6350,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = logical;
em->len = length;
em->orig_start = 0;
@@ -6455,6 +6574,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
u32 array_size;
u32 len = 0;
u32 cur_offset;
+ u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
@@ -6521,6 +6641,15 @@ int btrfs_read_sys_array(struct btrfs_root *root)
break;
}
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
+ btrfs_err(root->fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6930,7 +7059,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
/* In order to kick the device replace finish process */
lock_chunks(root);
list_for_each_entry(em, &transaction->pending_chunks, list) {
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d5c84f6b1353..3c651df420be 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
#include <linux/btrfs.h>
#include "async-thread.h"
+#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
+
extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN (64 * 1024)