Diffstat (limited to 'fs')
101 files changed, 2631 insertions, 1574 deletions
@@ -40,6 +40,7 @@ #include <linux/ramfs.h> #include <linux/percpu-refcount.h> #include <linux/mount.h> +#include <linux/nospec.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -1064,6 +1065,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6d1d0b93b1aa..c792df826e12 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o + uuid-tree.o props.o hash.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 38ee08675468..8f4baa3cb992 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1726,20 +1726,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, return err; } -/* - * The leaf data grows from end-to-front in the node. - * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(root); - return btrfs_item_offset_nr(leaf, nr - 1); -} - /* * search for key in the extent_buffer. The items start at offset p, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e847573c6db0..4a91d3119e59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ #include <linux/btrfs.h> #include <linux/workqueue.h> #include <linux/security.h> +#include <linux/sizes.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -897,6 +898,7 @@ struct btrfs_balance_item { #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 +#define BTRFS_FILE_EXTENT_TYPES 2 struct btrfs_file_extent_item { /* @@ -2283,7 +2285,7 @@ do { \ #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) struct btrfs_map_token { - struct extent_buffer *eb; + const struct extent_buffer *eb; char *kaddr; unsigned long offset; }; @@ -2314,18 +2316,19 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) sizeof(((type *)0)->member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ - unsigned long off, \ - struct btrfs_map_token *token); \ -void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \ +u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off, \ + struct btrfs_map_token *token); \ +void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ unsigned long off, u##bits val, \ struct btrfs_map_token *token); \ -static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \ +static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, \ unsigned long off) \ { \ return btrfs_get_token_##bits(eb, ptr, off, NULL); \ } \ -static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ +static inline void 
btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\ unsigned long off, u##bits val) \ { \ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ @@ -2337,7 +2340,8 @@ DECLARE_BTRFS_SETGET_BITS(32) DECLARE_BTRFS_SETGET_BITS(64) #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ @@ -2348,7 +2352,8 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ -static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \ +static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\ + const type *s, \ struct btrfs_map_token *token) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ @@ -2363,9 +2368,9 @@ static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - type *p = page_address(eb->pages[0]); \ + const type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ @@ -2377,7 +2382,7 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(type *s) \ +static inline u##bits btrfs_##name(const type *s) \ { \ return le##bits##_to_cpu(s->member); \ } \ @@ -2678,7 +2683,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr) sizeof(struct btrfs_key_ptr) * nr; } -void btrfs_node_key(struct extent_buffer *eb, +void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(struct extent_buffer *eb, @@ -2707,28 +2712,28 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) return (struct btrfs_item *)btrfs_item_nr_offset(nr); } -static inline u32 btrfs_item_end(struct extent_buffer *eb, +static inline u32 btrfs_item_end(const struct extent_buffer *eb, struct btrfs_item *item) { return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); } -static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_end(eb, btrfs_item_nr(nr)); } -static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_offset(eb, btrfs_item_nr(nr)); } -static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_size(eb, btrfs_item_nr(nr)); } -static inline void btrfs_item_key(struct extent_buffer *eb, +static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(nr); @@ -2764,8 +2769,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); -static inline void btrfs_dir_item_key(struct extent_buffer *eb, - struct 
btrfs_dir_item *item, +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_dir_item, location, key); @@ -2773,7 +2778,7 @@ static inline void btrfs_dir_item_key(struct extent_buffer *eb, static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, - struct btrfs_disk_key *key) + const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_dir_item, location, key); } @@ -2785,8 +2790,8 @@ BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, generation, 64); -static inline void btrfs_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, struct btrfs_disk_key *key) { read_eb_member(eb, h, struct btrfs_free_space_header, location, key); @@ -2794,7 +2799,7 @@ static inline void btrfs_free_space_key(struct extent_buffer *eb, static inline void btrfs_set_free_space_key(struct extent_buffer *eb, struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) + const struct btrfs_disk_key *key) { write_eb_member(eb, h, struct btrfs_free_space_header, location, key); } @@ -2821,25 +2826,25 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, disk->objectid = cpu_to_le64(cpu->objectid); } -static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, - struct btrfs_key *key, int nr) +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_node_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } -static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, - struct btrfs_key *key, int nr) +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_item_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } -static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, - struct btrfs_dir_item *item, - struct btrfs_key *key) +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) { struct btrfs_disk_key disk_key; btrfs_dir_item_key(eb, item, &disk_key); @@ -2872,7 +2877,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); -static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) { return (btrfs_header_flags(eb) & flag) == flag; } @@ -2891,7 +2896,7 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) return (flags & flag) == flag; } -static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) { u64 flags = btrfs_header_flags(eb); return flags >> BTRFS_BACKREF_REV_SHIFT; @@ -2911,12 +2916,12 @@ static inline unsigned long btrfs_header_fsid(void) return offsetof(struct btrfs_header, fsid); } -static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) { return 
offsetof(struct btrfs_header, chunk_tree_uuid); } -static inline int btrfs_is_leaf(struct extent_buffer *eb) +static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; } @@ -2950,12 +2955,12 @@ BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); -static inline bool btrfs_root_readonly(struct btrfs_root *root) +static inline bool btrfs_root_readonly(const struct btrfs_root *root) { return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } -static inline bool btrfs_root_dead(struct btrfs_root *root) +static inline bool btrfs_root_dead(const struct btrfs_root *root) { return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } @@ -3012,51 +3017,51 @@ BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); -static inline void btrfs_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } -static inline void btrfs_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } -static inline void btrfs_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - struct btrfs_disk_balance_args *disk) + const struct btrfs_disk_balance_args *disk) { memset(cpu, 0, sizeof(*cpu)); @@ -3076,7 +3081,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, static inline void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - struct btrfs_balance_args *cpu) + const struct btrfs_balance_args *cpu) { memset(disk, 0, sizeof(*disk)); @@ -3144,7 +3149,7 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +static inline int 
btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* @@ -3158,6 +3163,21 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) return offsetof(struct btrfs_leaf, items); } +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(const struct btrfs_root *root, + const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + /* struct btrfs_file_extent_item */ BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, @@ -3174,7 +3194,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); static inline unsigned long -btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) { return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; } @@ -3208,8 +3228,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, * size of any extent headers. If a file is compressed on disk, this is * the compressed size */ -static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, - struct btrfs_item *e) +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + struct btrfs_item *e) { return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } @@ -3217,9 +3238,9 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, /* this returns the number of file bytes represented by the inline item. 
* If an item is compressed, this is the uncompressed size */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - int slot, - struct btrfs_file_extent_item *fi) +static inline u32 btrfs_file_extent_inline_len(const struct extent_buffer *eb, + int slot, + const struct btrfs_file_extent_item *fi) { struct btrfs_map_token token; @@ -3241,8 +3262,8 @@ static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, /* btrfs_dev_stats_item */ -static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, - struct btrfs_dev_stats_item *ptr, +static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, int index) { u64 val; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 176a27bc63aa..81e5bc62e8e3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -620,7 +620,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1f21c6c33228..78722aaffecd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "raid56.h" #include "sysfs.h" #include "qgroup.h" +#include "tree-checker.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -522,72 +523,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d", reason, \ - btrfs_header_bytenr(eb), root->objectid, slot) - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_key key; - struct btrfs_key leaf_key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - if (nritems == 0) - return 0; - - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(root)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. - */ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EIO; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but - * all point outside of the leaf. 
- */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(root)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; - } - } - - return 0; -} - static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -654,11 +589,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } + if (found_level > 0 && btrfs_check_node(root, eb)) + ret = -EIO; + if (!ret) set_extent_buffer_uptodate(eb); err: @@ -3958,7 +3896,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, root->fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + /* + * btrfs_mark_buffer_dirty() can be called with the item pointers set + * up but the item data not yet updated, so only check the item + * pointers here, not the item data. + */ + if (btrfs_header_level(buf) == 0 && + btrfs_check_leaf_relaxed(root, buf)) { btrfs_print_leaf(root, buf); ASSERT(0); } @@ -4167,6 +4111,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents whose dirty pages haven't started writeout yet actually + * get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, -1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 13ff0fdae03e..978bbfed5a2c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2342,7 +2342,13 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; } - BUG_ON(node->ref_mod != 1); + if (node->ref_mod != 1) { + btrfs_err(root->fs_info, + "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + node->bytenr, node->ref_mod, node->action, ref_root, + parent); + return -EIO; + } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, root, @@ -9481,6 +9487,8 @@ static int find_first_block_group(struct btrfs_root *root, int ret = 0; struct btrfs_key found_key; struct extent_buffer *leaf; + struct btrfs_block_group_item bg; + u64 flags; int slot; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); @@ -9502,7 +9510,47 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else if (em->start != found_key.objectid || + em->len != found_key.offset) { + btrfs_err(root->fs_info, + "block group %llu len %llu 
mismatch with chunk %llu len %llu", + found_key.objectid, found_key.offset, + em->start, em->len); + ret = -EUCLEAN; + } else { + read_extent_buffer(leaf, &bg, + btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & + BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(root->fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + found_key.objectid, + found_key.offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & + em->map_lookup->type)); + ret = -EUCLEAN; + } else { + ret = 0; + } + } + free_extent_map(em); goto out; } path->slots[0]++; @@ -9717,6 +9765,62 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) return cache; } + +/* + * Iterate all chunks and verify that each of them has the corresponding block + * group + */ +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) +{ + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct btrfs_block_group_cache *bg; + u64 start = 0; + int ret = 0; + + while (1) { + read_lock(&map_tree->map_tree.lock); + /* + * lookup_extent_mapping will return the first extent map + * intersecting the range, so setting @len to 1 is enough to + * get the first chunk. + */ + em = lookup_extent_mapping(&map_tree->map_tree, start, 1); + read_unlock(&map_tree->map_tree.lock); + if (!em) + break; + + bg = btrfs_lookup_block_group(fs_info, em->start); + if (!bg) { + btrfs_err(fs_info, + "chunk start=%llu len=%llu doesn't have corresponding block group", + em->start, em->len); + ret = -EUCLEAN; + free_extent_map(em); + break; + } + if (bg->key.objectid != em->start || + bg->key.offset != em->len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", + em->start, em->len, + em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + bg->key.objectid, bg->key.offset, + bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ret = -EUCLEAN; + free_extent_map(em); + btrfs_put_block_group(bg); + break; + } + start = em->start + em->len; + free_extent_map(em); + btrfs_put_block_group(bg); + } + return ret; +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -9903,7 +10007,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) } init_global_block_rsv(info); - ret = 0; + ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); return ret; @@ -10388,7 +10492,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7de8d545f4d6..573c9d62cfc9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3847,8 +3847,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct block_device *bdev = fs_info->fs_devices->latest_bdev; struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; + u32 nritems; unsigned long i, num_pages; unsigned long bio_flags = 0; + unsigned long start, end; int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE) | REQ_META; int ret = 0; @@ -3858,6 +3860,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) bio_flags = EXTENT_BIO_TREE_LOG; + /* set btree blocks beyond nritems with 0 to avoid stale content. */ + nritems = btrfs_header_nritems(eb); + if (btrfs_header_level(eb) > 0) { + end = btrfs_node_key_ptr_offset(nritems); + + memset_extent_buffer(eb, 0, end, eb->len - end); + } else { + /* + * leaf: + * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 + */ + start = btrfs_item_nr_offset(nritems); + end = btrfs_leaf_data(eb) + + leaf_data_end(fs_info->tree_root, eb); + memset_extent_buffer(eb, 0, start, end - start); + } + for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -5351,9 +5370,8 @@ unlock_exit: return ret; } -void read_extent_buffer(struct extent_buffer *eb, void *dstv, - unsigned long start, - unsigned long len) +void read_extent_buffer(const struct extent_buffer *eb, void *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5382,9 +5400,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, - unsigned long start, - unsigned long len) +int read_extent_buffer_to_user(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5419,10 +5437,10 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, return ret; } -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **map, - unsigned long *map_start, - unsigned long *map_len) +int map_private_extent_buffer(const struct extent_buffer *eb, + unsigned long start, unsigned long min_len, + char **map, unsigned long *map_start, + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -5457,9 +5475,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, return 0; } -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len) +int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f4c1ae11855f..751435967724 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -308,14 +308,13 @@ static inline void extent_buffer_get(struct extent_buffer *eb) atomic_inc(&eb->refs); } -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len); -void read_extent_buffer(struct extent_buffer *eb, void *dst, +int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, + unsigned long start, unsigned long len); +void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, - unsigned long start, +int read_extent_buffer_to_user(const struct extent_buffer *eb, + void __user *dst, unsigned long start, unsigned long len); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); @@ -334,10 +333,10 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); int 
extent_buffer_under_io(struct extent_buffer *eb); -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **map, - unsigned long *map_start, - unsigned long *map_len); +int map_private_extent_buffer(const struct extent_buffer *eb, + unsigned long offset, unsigned long min_len, + char **map, unsigned long *map_start, + unsigned long *map_len); int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6dca9f937bf6..cc9ccc42f469 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3460,7 +3460,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 83c73738165e..40d1ab957fb6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3232,7 +3232,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) kfree(m); } -static void tail_append_pending_moves(struct pending_dir_move *moves, +static void tail_append_pending_moves(struct send_ctx *sctx, + struct pending_dir_move *moves, struct list_head *stack) { if (list_empty(&moves->list)) { @@ -3243,6 +3244,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves, list_add_tail(&moves->list, stack); list_splice_tail(&list, stack); } + if (!RB_EMPTY_NODE(&moves->node)) { + rb_erase(&moves->node, &sctx->pending_dir_moves); + RB_CLEAR_NODE(&moves->node); + } } static int apply_children_dir_moves(struct send_ctx *sctx) @@ -3257,7 +3262,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) return 0; INIT_LIST_HEAD(&stack); - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); @@ -3268,7 +3273,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) goto out; pm = get_pending_dir_moves(sctx, parent_ino); if (pm) - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); } return 0; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index b976597b0721..63ffd213b0b7 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -50,8 +50,8 @@ static inline void put_unaligned_le8(u8 val, void *p) */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits 
btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ - unsigned long off, \ +u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off, \ struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)ptr; \ @@ -90,7 +90,8 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ return res; \ } \ void btrfs_set_token_##bits(struct extent_buffer *eb, \ - void *ptr, unsigned long off, u##bits val, \ + const void *ptr, unsigned long off, \ + u##bits val, \ struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)ptr; \ @@ -133,7 +134,7 @@ DEFINE_BTRFS_SETGET_BITS(16) DEFINE_BTRFS_SETGET_BITS(32) DEFINE_BTRFS_SETGET_BITS(64) -void btrfs_node_key(struct extent_buffer *eb, +void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c new file mode 100644 index 000000000000..5b98f3c76ce4 --- /dev/null +++ b/fs/btrfs/tree-checker.c @@ -0,0 +1,649 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +/* + * The module is used to catch unexpected/corrupted tree block data. + * Such behavior can be caused either by a fuzzed image or bugs. + * + * The objective is to do leaf/node validation checks when a tree block is + * read from disk, and check *every* possible member, so other code won't + * need to check them again. + * + * Due to the potential for unwanted damage, every checker needs to be + * carefully reviewed so that it does not prevent mounting of valid images. + */ + +#include "ctree.h" +#include "tree-checker.h" +#include "disk-io.h" +#include "compression.h" +#include "hash.h" +#include "volumes.h" + +#define CORRUPT(reason, eb, root, slot) \ + btrfs_crit(root->fs_info, \ + "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ + btrfs_header_level(eb) == 0 ? "leaf" : "node", \ + reason, btrfs_header_bytenr(eb), root->objectid, slot) + +/* + * Error messages should follow the following format: + * corrupt <type>: <identifier>, <reason>[, <bad_value>] + * + * @type: leaf or node + * @identifier: the necessary info to locate the leaf/node. + * It's recommended to decode key.objectid/offset if it's + * meaningful. + * @reason: describe the error + * @bad_value: optional, it's recommended to output the bad value and its + * expected value (range). + * + * Since comma is used to separate the components, only space is allowed + * inside each component. + */ + +/* + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. + * Allows callers to customize the output. + */ +__printf(4, 5) +static void generic_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + va_end(args); +} + +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for file extent", + leaf, root, slot); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + CORRUPT("invalid file extent type", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Support for new compression/encryption must introduce an incompat + * flag, and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + CORRUPT("invalid file extent compression", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + CORRUPT("invalid file extent encryption", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + CORRUPT("inline extent has non-zero key offset", + leaf, root, slot); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + CORRUPT("plaintext inline extent has invalid size", + leaf, root, slot); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + CORRUPT( + "regular or preallocated extent data item size is invalid", + leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { + CORRUPT( + "regular or preallocated extent data item has unaligned value", + leaf, root, slot); + return -EUCLEAN; + } + + return 0; +} + +static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 sectorsize = root->sectorsize; + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + CORRUPT("invalid objectid for csum item", leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for csum item", leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + CORRUPT("unaligned csum item size", leaf, root, slot); + return -EUCLEAN; + } + return 0; +} + +/* + * Customized report for dir_item; the only important extra info is + * key->objectid, which represents the inode number + */ +__printf(4, 5) +static void dir_item_err(const struct btrfs_root 
*root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, &vaf); + va_end(args); +} + +static int check_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dir_item *di; + u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 cur = 0; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + while (cur < item_size) { + u32 name_len; + u32 data_len; + u32 max_name_len; + u32 total_size; + u32 name_hash; + u8 dir_type; + + /* header itself should not cross item boundary */ + if (cur + sizeof(*di) > item_size) { + dir_item_err(root, leaf, slot, + "dir item header crosses item boundary, have %zu boundary %u", + cur + sizeof(*di), item_size); + return -EUCLEAN; + } + + /* dir type check */ + dir_type = btrfs_dir_type(leaf, di); + if (dir_type >= BTRFS_FT_MAX) { + dir_item_err(root, leaf, slot, + "invalid dir item type, have %u expect [0, %u)", + dir_type, BTRFS_FT_MAX); + return -EUCLEAN; + } + + if (key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "invalid dir item type for XATTR key, have %u expect %u", + dir_type, BTRFS_FT_XATTR); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY) { + dir_item_err(root, leaf, slot, + "xattr dir type found for non-XATTR key"); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR) + max_name_len = XATTR_NAME_MAX; + else + max_name_len = BTRFS_NAME_LEN; + + /* Name/data length check */ + name_len = btrfs_dir_name_len(leaf, di); + data_len = btrfs_dir_data_len(leaf, di); + if (name_len > max_name_len) { + dir_item_err(root, leaf, slot, + "dir item name len too long, have %u max %u", + name_len, max_name_len); + return -EUCLEAN; + } + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) { + dir_item_err(root, leaf, slot, + "dir item name and data len too long, have %u max %zu", + name_len + data_len, + BTRFS_MAX_XATTR_SIZE(root)); + return -EUCLEAN; + } + + if (data_len && dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "dir item with invalid data len, have %u expect 0", + data_len); + return -EUCLEAN; + } + + total_size = sizeof(*di) + name_len + data_len; + + /* header and name/data should not cross item boundary */ + if (cur + total_size > item_size) { + dir_item_err(root, leaf, slot, + "dir item data crosses item boundary, have %u boundary %u", + cur + total_size, item_size); + return -EUCLEAN; + } + + /* + * Special check for XATTR/DIR_ITEM, as key->offset is name + * hash, should match its name + */ + if (key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_XATTR_ITEM_KEY) { + char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + + read_extent_buffer(leaf, namebuf, + (unsigned long)(di + 1), name_len); + name_hash = btrfs_name_hash(namebuf, name_len); + if (key->offset != name_hash) { + dir_item_err(root, leaf, slot, + "name hash mismatch with key, have 0x%016x expect 0x%016llx", + name_hash, key->offset); + return -EUCLEAN; + } + } + cur += total_size; + di = (struct btrfs_dir_item *)((void *)di + total_size); + } + return 0; +} + +__printf(4, 5) +__cold +static void 
block_group_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, key.offset, &vaf); + va_end(args); +} + +static int check_block_group_item(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_block_group_item bgi; + u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 flags; + u64 type; + + /* + * Here we don't really care about alignment since the extent allocator + * can handle it. We care more about the size: if one block group is + * larger than the maximum size, it must be some obvious corruption. + */ + if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) { + block_group_err(fs_info, leaf, slot, + "invalid block group size, have %llu expect (0, %llu]", + key->offset, BTRFS_MAX_DATA_CHUNK_SIZE); + return -EUCLEAN; + } + + if (item_size != sizeof(bgi)) { + block_group_err(fs_info, leaf, slot, + "invalid item size, have %u expect %zu", + item_size, sizeof(bgi)); + return -EUCLEAN; + } + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + if (btrfs_block_group_chunk_objectid(&bgi) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID) { + block_group_err(fs_info, leaf, slot, + "invalid block group chunk objectid, have %llu expect %llu", + btrfs_block_group_chunk_objectid(&bgi), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + + if (btrfs_block_group_used(&bgi) > key->offset) { + block_group_err(fs_info, leaf, slot, + "invalid block group used, have %llu expect [0, %llu)", + btrfs_block_group_used(&bgi), key->offset); + return -EUCLEAN; + } + + flags = btrfs_block_group_flags(&bgi); + if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { + block_group_err(fs_info, leaf, slot, +"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", + flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, + hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); + return -EUCLEAN; + } + + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + if (type != BTRFS_BLOCK_GROUP_DATA && + type != BTRFS_BLOCK_GROUP_METADATA && + type != BTRFS_BLOCK_GROUP_SYSTEM && + type != (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA)) { + block_group_err(fs_info, leaf, slot, +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", + type, hweight64(type), + BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); + return -EUCLEAN; + } + return 0; +} + +/* + * Common point to switch the item-specific validation. 
+ */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(root, leaf, key, slot); + break; + case BTRFS_DIR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + ret = check_dir_item(root, leaf, key, slot); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(root->fs_info, leaf, key, slot); + break; + } + return ret; +} + +static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, + bool check_item_data) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + /* No valid key type is 0, so all keys should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; + struct btrfs_key key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (btrfs_header_level(leaf) != 0) { + generic_err(root, leaf, 0, + "invalid level for leaf, have %d expect 0", + btrfs_header_level(leaf)); + return -EUCLEAN; + } + + /* + * Extent buffers from a relocation tree have an owner field that + * corresponds to the subvolume tree they are based on. So from an + * extent buffer alone we cannot find out the id of the corresponding + * subvolume tree, and thus cannot figure out whether the extent + * buffer corresponds to the root of the relocation tree or not. So + * skip this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + u64 owner = btrfs_header_owner(leaf); + struct btrfs_root *check_root; + + /* These trees must never be empty */ + if (owner == BTRFS_ROOT_TREE_OBJECTID || + owner == BTRFS_CHUNK_TREE_OBJECTID || + owner == BTRFS_EXTENT_TREE_OBJECTID || + owner == BTRFS_DEV_TREE_OBJECTID || + owner == BTRFS_FS_TREE_OBJECTID || + owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { + generic_err(root, leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return -EUCLEAN; + } + key.objectid = owner; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots have not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + struct extent_buffer *eb; + + eb = btrfs_root_node(check_root); + /* if leaf is the root, then it's fine */ + if (leaf != eb) { + CORRUPT("non-root leaf's nritems is 0", + leaf, check_root, 0); + free_extent_buffer(eb); + return -EUCLEAN; + } + free_extent_buffer(eb); + } + return 0; + } + + if (nritems == 0) + return 0; + + /* + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key ordering + * 2) item offset and size + * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity checks. + * NOTE: All checks must only rely on the item data itself. + */ + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + int ret; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. 
+		 */
+		if (slot == 0)
+			item_end_expected = BTRFS_LEAF_DATA_SIZE(root);
+		else
+			item_end_expected = btrfs_item_offset_nr(leaf,
+								 slot - 1);
+		if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+			CORRUPT("slot offset bad", leaf, root, slot);
+			return -EUCLEAN;
+		}
+
+		/*
+		 * Check to make sure that we don't point outside of the leaf,
+		 * just in case all the items are consistent to each other, but
+		 * all point outside of the leaf.
+		 */
+		if (btrfs_item_end_nr(leaf, slot) >
+		    BTRFS_LEAF_DATA_SIZE(root)) {
+			CORRUPT("slot end outside of leaf", leaf, root, slot);
+			return -EUCLEAN;
+		}
+
+		/* Also check if the item pointer overlaps with btrfs item. */
+		if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
+		    btrfs_item_ptr_offset(leaf, slot)) {
+			CORRUPT("slot overlap with its data", leaf, root, slot);
+			return -EUCLEAN;
+		}
+
+		if (check_item_data) {
+			/*
+			 * Check if the item size and content meet other
+			 * criteria
+			 */
+			ret = check_leaf_item(root, leaf, &key, slot);
+			if (ret < 0)
+				return ret;
+		}
+
+		prev_key.objectid = key.objectid;
+		prev_key.type = key.type;
+		prev_key.offset = key.offset;
+	}
+
+	return 0;
+}
+
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf)
+{
+	return check_leaf(root, leaf, true);
+}
+
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+			     struct extent_buffer *leaf)
+{
+	return check_leaf(root, leaf, false);
+}
+
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+	unsigned long nr = btrfs_header_nritems(node);
+	struct btrfs_key key, next_key;
+	int slot;
+	int level = btrfs_header_level(node);
+	u64 bytenr;
+	int ret = 0;
+
+	if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+		generic_err(root, node, 0,
+			"invalid level for node, have %d expect [1, %d]",
+			level, BTRFS_MAX_LEVEL - 1);
+		return -EUCLEAN;
+	}
+	if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+		btrfs_crit(root->fs_info,
+"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%zu]",
+			   root->objectid, node->start,
+			   nr == 0 ? "small" : "large", nr,
+			   BTRFS_NODEPTRS_PER_BLOCK(root));
+		return -EUCLEAN;
+	}
+
+	for (slot = 0; slot < nr - 1; slot++) {
+		bytenr = btrfs_node_blockptr(node, slot);
+		btrfs_node_key_to_cpu(node, &key, slot);
+		btrfs_node_key_to_cpu(node, &next_key, slot + 1);
+
+		if (!bytenr) {
+			generic_err(root, node, slot,
+				"invalid NULL node pointer");
+			ret = -EUCLEAN;
+			goto out;
+		}
+		if (!IS_ALIGNED(bytenr, root->sectorsize)) {
+			generic_err(root, node, slot,
+				"unaligned pointer, have %llu should be aligned to %u",
+				bytenr, root->sectorsize);
+			ret = -EUCLEAN;
+			goto out;
+		}
+
+		if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+			generic_err(root, node, slot,
+				"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
+				key.objectid, key.type, key.offset,
+				next_key.objectid, next_key.type,
+				next_key.offset);
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+out:
+	return ret;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000000..3d53e8d6fda0
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) Qu Wenruo 2017.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+#ifndef __BTRFS_TREE_CHECKER__
+#define __BTRFS_TREE_CHECKER__
+
+#include "ctree.h"
+#include "extent_io.h"
+
+/*
+ * Comprehensive leaf checker.
+ * Will check not only the item pointers, but also every possible member
+ * in item data.
+ */
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf);
+
+/*
+ * Less strict leaf checker.
+ * Will only check item pointers, not reading item data.
+ */
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+			     struct extent_buffer *leaf);
+int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b4d63a9842fa..5e8fe8f3942d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1184,7 +1184,7 @@ again:
 		struct map_lookup *map;
 		int i;
 
-		map = (struct map_lookup *)em->bdev;
+		map = em->map_lookup;
 		for (i = 0; i < map->num_stripes; i++) {
 			u64 end;
 
@@ -2757,7 +2757,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		free_extent_map(em);
 		return -EINVAL;
 	}
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	lock_chunks(root->fs_info->chunk_root);
 	check_system_chunk(trans, extent_root, map->type);
 	unlock_chunks(root->fs_info->chunk_root);
@@ -4540,7 +4540,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_stripe_size = 1024 * 1024 * 1024;
-		max_chunk_size = 10 * max_stripe_size;
+		max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
 		if (!devs_max)
 			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
@@ -4731,7 +4731,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		goto error;
 	}
 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-	em->bdev = (struct block_device *)map;
+	em->map_lookup = map;
 	em->start = start;
 	em->len = num_bytes;
 	em->block_start = 0;
@@ -4826,7 +4826,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		return -EINVAL;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	item_size = btrfs_chunk_item_size(map->num_stripes);
 	stripe_size = em->orig_block_len;
 
@@ -4968,7 +4968,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 	if (!em)
 		return 1;
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	for (i = 0; i < map->num_stripes; i++) {
 		if (map->stripes[i].dev->missing) {
 			miss_ndevs++;
@@ -5048,7 +5048,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		return 1;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5091,7 +5091,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 		len = map->stripe_len * nr_data_stripes(map);
 	free_extent_map(em);
@@ -5112,7 +5112,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 		ret = 1;
 	free_extent_map(em);
@@ -5271,7 +5271,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		return -EINVAL;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	offset = logical - em->start;
 
 	stripe_len = map->stripe_len;
@@ -5813,7 +5813,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		free_extent_map(em);
 		return -EIO;
 	}
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 
 	length = em->len;
 	rmap_len = map->stripe_len;
@@ -6208,6 +6208,101 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	return dev;
 }
 
+/* Return -EIO if any error, otherwise return 0. */
+static int btrfs_check_chunk_valid(struct btrfs_root *root,
+				   struct extent_buffer *leaf,
+				   struct btrfs_chunk *chunk, u64 logical)
+{
+	u64 length;
+	u64 stripe_len;
+	u16 num_stripes;
+	u16 sub_stripes;
+	u64 type;
+	u64 features;
+	bool mixed = false;
+
+	length = btrfs_chunk_length(leaf, chunk);
+	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+	type = btrfs_chunk_type(leaf, chunk);
+
+	if (!num_stripes) {
+		btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+			  num_stripes);
+		return -EIO;
+	}
+	if (!IS_ALIGNED(logical, root->sectorsize)) {
+		btrfs_err(root->fs_info,
+			  "invalid chunk logical %llu", logical);
+		return -EIO;
+	}
+	if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
+		btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+			  btrfs_chunk_sector_size(leaf, chunk));
+		return -EIO;
+	}
+	if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+		btrfs_err(root->fs_info,
+			  "invalid chunk length %llu", length);
+		return -EIO;
+	}
+	if (!is_power_of_2(stripe_len)) {
+		btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+			  stripe_len);
+		return -EIO;
+	}
+	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+	    type) {
+		btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+			  btrfs_chunk_type(leaf, chunk));
+		return -EIO;
+	}
+
+	if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+		btrfs_err(root->fs_info, "missing chunk type flag: 0x%llx", type);
+		return -EIO;
+	}
+
+	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+	    (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+		btrfs_err(root->fs_info,
+			  "system chunk with data or metadata type: 0x%llx", type);
+		return -EIO;
+	}
+
+	features = btrfs_super_incompat_flags(root->fs_info->super_copy);
+	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = true;
+
+	if (!mixed) {
+		if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+		    (type & BTRFS_BLOCK_GROUP_DATA)) {
+			btrfs_err(root->fs_info,
+			"mixed chunk type in non-mixed mode: 0x%llx", type);
+			return -EIO;
+		}
+	}
+
+	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+	     num_stripes != 1)) {
+		btrfs_err(root->fs_info,
+			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
+			num_stripes, sub_stripes,
+			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 static int read_one_chunk(struct btrfs_root *root,
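The per-profile stripe constraints enforced by btrfs_check_chunk_valid() above are easy to misread in one big OR-chain, so here is a minimal userspace sketch of the same table. The BG_* constants are hypothetical stand-ins for the kernel's BTRFS_BLOCK_GROUP_* bits, and the thresholds mirror the checks in the patch rather than any independent specification:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the BTRFS_BLOCK_GROUP_* profile bits. */
    #define BG_RAID10 (1ULL << 0)
    #define BG_RAID1  (1ULL << 1)
    #define BG_RAID5  (1ULL << 2)
    #define BG_RAID6  (1ULL << 3)
    #define BG_DUP    (1ULL << 4)
    #define BG_PROFILE_MASK (BG_RAID10 | BG_RAID1 | BG_RAID5 | BG_RAID6 | BG_DUP)

    /*
     * Mirrors the shape of the final check in btrfs_check_chunk_valid():
     * each profile implies a constraint on num_stripes/sub_stripes, and a
     * chunk with no profile bit at all must be a single-stripe chunk.
     */
    static bool stripe_counts_valid(uint64_t type, uint16_t num_stripes,
                                    uint16_t sub_stripes)
    {
        if ((type & BG_RAID10) && sub_stripes != 2)
            return false;
        if ((type & BG_RAID1) && num_stripes < 1)
            return false;
        if ((type & BG_RAID5) && num_stripes < 2)
            return false;
        if ((type & BG_RAID6) && num_stripes < 3)
            return false;
        if ((type & BG_DUP) && num_stripes > 2)
            return false;
        if ((type & BG_PROFILE_MASK) == 0 && num_stripes != 1)
            return false;
        return true;
    }

    int main(void)
    {
        /* A RAID6 chunk with only two stripes must be rejected. */
        printf("%d\n", stripe_counts_valid(BG_RAID6, 2, 0)); /* 0 */
        printf("%d\n", stripe_counts_valid(BG_RAID6, 3, 0)); /* 1 */
        return 0;
    }

The point of doing this before read_one_chunk() allocates anything is that a crafted image fails fast with one error message instead of feeding bogus stripe counts into map_lookup_size().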
struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -6217,6 +6312,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6225,6 +6321,12 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6241,7 +6343,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6249,7 +6350,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6473,6 +6574,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6539,6 +6641,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6948,7 +7059,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d5c84f6b1353..3c651df420be 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -24,6 +24,8 @@ #include <linux/btrfs.h> #include "async-thread.h" +#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) + extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN (64 * 1024) diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 5b68cf526887..c05ab2ec0fef 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -963,11 +963,8 @@ error: void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) { struct cachefiles_object *object; - struct cachefiles_cache *cache; object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); _enter("%p,{%lu}", object, page->index); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0e3de1bb6500..e7b54514d99a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3243,7 +3243,6 @@ retry: tcap->cap_id = t_cap_id; tcap->seq = t_seq - 1; tcap->issue_seq = t_seq - 1; - tcap->mseq = t_mseq; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1eeb4780c3ed..eacf57c24ca9 100644 --- 
a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -48,6 +48,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "dns_resolve.h" #include "ntlmssp.h" #include "nterr.h" #include "rfc1002pdu.h" @@ -304,6 +305,53 @@ static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, const char *devname); /* + * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may + * get their ip addresses changed at some point. + * + * This should be called with server->srv_mutex held. + */ +#ifdef CONFIG_CIFS_DFS_UPCALL +static int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + int rc; + int len; + char *unc, *ipaddr = NULL; + + if (!server->hostname) + return -EINVAL; + + len = strlen(server->hostname) + 3; + + unc = kmalloc(len, GFP_KERNEL); + if (!unc) { + cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); + return -ENOMEM; + } + snprintf(unc, len, "\\\\%s", server->hostname); + + rc = dns_resolve_server_name_to_ip(unc, &ipaddr); + kfree(unc); + + if (rc < 0) { + cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", + __func__, server->hostname, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr, + strlen(ipaddr)); + kfree(ipaddr); + + return !rc ? -1 : 0; +} +#else +static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + return 0; +} +#endif + +/* * cifs tcp session reconnection * * mark tcp session as reconnecting so temporarily locked @@ -400,6 +448,11 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); + rc = reconn_set_ipaddr(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } mutex_unlock(&server->srv_mutex); msleep(3000); } else { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 79a1bad88931..e5357c7b5d4c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1073,14 +1073,18 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) { free_xid(xid); return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); @@ -1404,12 +1408,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. 
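reconn_set_ipaddr() above allocates strlen(hostname) + 3 bytes because the UNC form is two leading backslashes plus the name plus the NUL terminator, and it is only compiled in when the DFS upcall (and thus the in-kernel DNS resolver) is available. A hedged userspace analogue of the "re-resolve before retrying the connect" idea, using getaddrinfo() where the kernel path goes through the dns_resolve_server_name_to_ip() upcall:

    #include <netdb.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    /*
     * Userspace analogue only: refresh the cached server address from DNS
     * so the next connect() attempt can follow a changed A/AAAA record.
     */
    static int refresh_server_addr(const char *hostname,
                                   struct sockaddr_storage *dst)
    {
        struct addrinfo hints = { .ai_family = AF_UNSPEC,
                                  .ai_socktype = SOCK_STREAM };
        struct addrinfo *res;

        if (getaddrinfo(hostname, NULL, &hints, &res))
            return -1;      /* keep the old address on failure */
        memcpy(dst, res->ai_addr, res->ai_addrlen);
        freeaddrinfo(res);
        return 0;
    }

    int main(void)
    {
        struct sockaddr_storage dst;

        /* Re-resolve after a failed attempt, as cifs_reconnect() does. */
        return refresh_server_addr("example.com", &dst) ? 1 : 0;
    }

Note the placement in cifs_reconnect(): the re-resolution runs only after generic_ip_connect() fails, so a healthy session never pays for a DNS round trip.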
*/ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 57b039ebfb1f..43fa471c88d7 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -652,7 +652,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, /* scan and find it */ int i; char *cur_ent; - char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + + char *end_of_smb; + + if (cfile->srch_inf.ntwrk_buf_start == NULL) { + cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n"); + return -EIO; + } + + end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( cfile->srch_inf.ntwrk_buf_start); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b2aff0c6f22c..dee5250701de 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -123,12 +123,14 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < sizeof(struct smb2_lock_element)) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) @@ -265,6 +267,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 8257a5a97cc0..98c25b969ab8 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -377,8 +377,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"}, {STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"}, {STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"}, - {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"}, - {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"}, + {STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"}, + {STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"}, {STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"}, {STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS, "STATUS_CTL_FILE_NOT_SUPPORTED"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2725085a3f9f..eae3cdffaf7f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -143,14 +143,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, scredits = server->credits; /* can deadlock with reopen */ - if (scredits == 1) { + if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; *credits = 0; break; } - /* leave one credit for a possible reopen */ - scredits--; + /* leave some credits for reopen and other ops */ + scredits -= 8; *num = min_t(unsigned int, size, scredits * 
SMB2_MAX_BUFFER_SIZE); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f7111bb88ec1..5e21d58c49ef 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2523,8 +2523,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; - } - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } else + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 54af10204e83..1cf0a336ec06 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -360,7 +360,7 @@ uncork: if (rc < 0 && rc != -EINTR) cifs_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else + else if (rc > 0) rc = 0; return rc; diff --git a/fs/dcache.c b/fs/dcache.c index 86f52a555dec..2416ad64cc62 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1155,15 +1155,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, */ void shrink_dcache_sb(struct super_block *sb) { - long freed; - do { LIST_HEAD(dispose); - freed = list_lru_walk(&sb->s_dentry_lru, + list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); - - this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e49ba072bd64..22fe11baef2b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -671,6 +671,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; + if (IS_ERR(old_dir)) + return old_dir; + if (IS_ERR(new_dir)) + return new_dir; + if (IS_ERR_OR_NULL(old_dentry)) + return old_dentry; + trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? 
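The cifs/smb2 locking hunks above all converge on one sizing rule for the server-supplied maxBuf: reject a value too small to hold even one lock element, clamp to PAGE_SIZE, and only then derive the element count for the allocation. A small sketch of that rule as the smb2 path applies it; struct lock_element here is a hypothetical stand-in for smb2_lock_element / LOCKING_ANDX_RANGE:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096u

    struct lock_element { uint64_t offset, length; uint32_t pid, flags; };

    /*
     * Treat max_buf as untrusted input: a tiny value must fail cleanly
     * (-EINVAL in the driver) and a huge value must not inflate the
     * later kcalloc()-style allocation beyond one page.
     */
    static int max_lock_elements(unsigned int max_buf)
    {
        if (max_buf < sizeof(struct lock_element))
            return -1;                  /* too small for one element */
        if (max_buf > PAGE_SIZE)
            max_buf = PAGE_SIZE;        /* min_t(..., PAGE_SIZE) */
        return max_buf / sizeof(struct lock_element);
    }

    int main(void)
    {
        printf("%d\n", max_lock_elements(0));       /* rejected */
        printf("%d\n", max_lock_elements(1 << 20)); /* clamped to a page */
        return 0;
    }

The BUILD_BUG_ON in the patch pins the other end of the range at compile time: one element must always fit in PAGE_SIZE, so max_num can never be zero after the clamp.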
*/ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index dcea1e37a1b7..f18619bc2e09 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -290,6 +290,8 @@ void dlm_callback_suspend(struct dlm_ls *ls) flush_workqueue(ls->ls_callback_wq); } +#define MAX_CB_QUEUE 25 + void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; @@ -300,15 +302,23 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; +more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); count++; + if (count == MAX_CB_QUEUE) + break; } mutex_unlock(&ls->ls_cb_mutex); if (count) log_rinfo(ls, "dlm_callback_resume %d", count); + if (count == MAX_CB_QUEUE) { + count = 0; + cond_resched(); + goto more; + } } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 35502d4046f5..3a7f401e943c 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1210,6 +1210,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); + dlm_free_lkb(lkb); return rv; } @@ -4177,6 +4178,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) (unsigned long long)lkb->lkb_recover_seq, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -4230,6 +4232,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_id, lkb->lkb_remid, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -5792,20 +5795,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } - - /* After ua is attached to lkb it will be freed by dlm_free_lkb(). - When DLM_IFL_USER is set, the dlm knows that this is a userspace - lock and that lkb_astparam is the dlm_user_args structure. */ - error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); - lkb->lkb_flags |= DLM_IFL_USER; - if (error) { + kfree(ua->lksb.sb_lvbptr); + ua->lksb.sb_lvbptr = NULL; + kfree(ua); __put_lkb(ls, lkb); goto out; } + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + lkb->lkb_flags |= DLM_IFL_USER; error = request_lock(ls, lkb, name, namelen, &args); switch (error) { diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f3e72787e7f9..30e4e01db35a 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -673,11 +673,11 @@ static int new_lockspace(const char *name, const char *cluster, kfree(ls->ls_recover_buf); out_lkbidr: idr_destroy(&ls->ls_lkbidr); + out_rsbtbl: for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { if (ls->ls_remove_names[i]) kfree(ls->ls_remove_names[i]); } - out_rsbtbl: vfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 066df649a6b0..ac21caad6729 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1035,7 +1035,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. 
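The dlm_callback_resume() change above is a bounded-batch drain: requeue at most MAX_CB_QUEUE delayed callbacks per pass, drop ls_cb_mutex, yield, and loop while a full batch was found, so a huge backlog cannot monopolize the CPU under the mutex. A userspace analogue with a pthread mutex and sched_yield() standing in for the kernel primitives:

    #include <pthread.h>
    #include <sched.h>

    #define MAX_CB_QUEUE 25

    struct node { struct node *next; };

    static struct node *delayed;        /* protected by list_lock */
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void queue_work(struct node *n) { (void)n; /* hand off */ }

    /* Drain the delayed list in batches instead of all at once. */
    static void callback_resume(void)
    {
        int count;

        do {
            count = 0;
            pthread_mutex_lock(&list_lock);
            while (delayed && count < MAX_CB_QUEUE) {
                struct node *n = delayed;

                delayed = n->next;
                queue_work(n);
                count++;
            }
            pthread_mutex_unlock(&list_lock);
            sched_yield();      /* cond_resched() in the kernel */
        } while (count == MAX_CB_QUEUE);    /* full batch: go again */
    }

    int main(void) { callback_resume(); return 0; }

Exiting only when a partial batch was found is what guarantees termination: the final pass under the lock observes the list empty.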
*/ - if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (ep->ovflist != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; diff --git a/fs/exec.c b/fs/exec.c index 6bbea71c6d92..ab6e34e1de21 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,8 +200,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif - ret = get_user_pages(current, bprm->mm, pos, - 1, write, 1, &page, NULL); + + if (write) + gup_flags |= FOLL_WRITE; + + ret = get_user_pages(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6ba3..6599c6124552 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -76,7 +76,7 @@ static bool dentry_connected(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); dput(dentry); - if (IS_ROOT(dentry)) { + if (dentry == parent) { dput(parent); return false; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cab694540930..ec506c2733ee 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -715,8 +715,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto out_up_read; + } } ret = 1; @@ -1870,12 +1873,12 @@ int ext4_inline_data_fiemap(struct inode *inode, physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); - if (physical) - error = fiemap_fill_next_extent(fieinfo, start, physical, - inline_len, flags); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); + if (physical) + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); return (error < 0 ? error : 0); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bad13f049fb0..2fc1564f62dd 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1600,7 +1600,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (ext4_has_feature_resize_inode(sb) || + if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2cd426a4161d..0ec3689a8990 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1048,6 +1048,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +static int ext4_nfs_commit_metadata(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL + }; + + trace_ext4_nfs_commit_metadata(inode); + return ext4_write_inode(inode, &wbc); +} + /* * Try to release metadata pages (indirect blocks, directories) which are * mapped via the block device. 
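The exportfs fix above replaces IS_ROOT(dentry), which was tested after the dentry had already been dput(), with a comparison against the parent that dget_parent() just returned: a detached root is its own parent. A minimal sketch of that termination test, with a boolean field standing in for DCACHE_DISCONNECTED and all reference counting omitted:

    #include <stdbool.h>
    #include <stdio.h>

    struct dentry {
        struct dentry *d_parent;
        bool disconnected;      /* stands in for DCACHE_DISCONNECTED */
    };

    static struct dentry *dget_parent(struct dentry *d)
    {
        return d->d_parent;     /* real code also takes a reference */
    }

    /* Walk upward while the disconnected flag is set; stopping when the
     * parent equals the child detects a detached root safely, using only
     * pointers we still hold. */
    static bool dentry_connected(struct dentry *dentry)
    {
        while (dentry->disconnected) {
            struct dentry *parent = dget_parent(dentry);

            if (dentry == parent)       /* detached root: give up */
                return false;
            dentry = parent;
        }
        return true;
    }

    int main(void)
    {
        struct dentry root = { .d_parent = &root, .disconnected = true };
        struct dentry child = { .d_parent = &root, .disconnected = true };

        printf("%d\n", dentry_connected(&child));   /* 0 */
        return 0;
    }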
Since these pages could have journal heads @@ -1142,6 +1152,7 @@ static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, + .commit_metadata = ext4_nfs_commit_metadata, }; enum { @@ -5187,9 +5198,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); - iput(qf_inode); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); + iput(qf_inode); return err; } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index f82f916e1f2c..7ebf7b958f9e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, return (void *)f2fs_acl; fail: - kfree(f2fs_acl); + kvfree(f2fs_acl); return ERR_PTR(-EINVAL); } @@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = NULL; else acl = ERR_PTR(retval); - kfree(value); + kvfree(value); if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); @@ -243,7 +243,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); - kfree(value); + kvfree(value); if (!error) set_cached_acl(inode, type, acl); @@ -355,12 +355,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return PTR_ERR(p); clone = f2fs_acl_clone(p, GFP_NOFS); - if (!clone) - goto no_mem; + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } ret = f2fs_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto release_clone; if (ret == 0) posix_acl_release(clone); @@ -374,11 +376,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +release_clone: posix_acl_release(clone); -no_mem: +release_acl: posix_acl_release(p); - return -ENOMEM; + return ret; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4ed2d2a0bf02..940372ebfc60 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -44,7 +44,7 @@ repeat: cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META, true); + f2fs_wait_on_page_writeback(page, META, true, true); if (!PageUptodate(page)) SetPageUptodate(page); return page; @@ -371,9 +371,8 @@ continue_unlock: goto continue_unlock; } - f2fs_wait_on_page_writeback(page, META, true); + f2fs_wait_on_page_writeback(page, META, true, true); - BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -912,7 +911,7 @@ free_fail_no_cp: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); fail_no_cp: - kfree(sbi->ckpt); + kvfree(sbi->ckpt); return -EINVAL; } @@ -1291,11 +1290,11 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, struct page *page = f2fs_grab_meta_page(sbi, blk_addr); int err; + f2fs_wait_on_page_writeback(page, META, true, true); + memcpy(page_address(page), src, PAGE_SIZE); - set_page_dirty(page); - f2fs_wait_on_page_writeback(page, META, true); - f2fs_bug_on(sbi, PageWriteback(page)); + set_page_dirty(page); if (unlikely(!clear_page_dirty_for_io(page))) f2fs_bug_on(sbi, 1); @@ -1329,11 +1328,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) int err; /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) { - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - if (unlikely(f2fs_cp_error(sbi))) - 
break; - } + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && + !f2fs_cp_error(sbi)); /* * modify checkpoint @@ -1406,14 +1403,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) for (i = 0; i < nm_i->nat_bits_blocks; i++) f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); - - /* Flush all the NAT BITS pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) { - f2fs_sync_meta_pages(sbi, META, LONG_MAX, - FS_CP_META_IO); - if (unlikely(f2fs_cp_error(sbi))) - break; - } } /* write out checkpoint buffer at block 0 */ @@ -1449,6 +1438,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && + !f2fs_cp_error(sbi)); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages_writeback(sbi); @@ -1466,7 +1457,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * invalidate intermediate page cache borrowed from meta inode * which are used for migration of encrypted inode's blocks. */ - if (f2fs_sb_has_encrypt(sbi->sb)) + if (f2fs_sb_has_encrypt(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8286f2114c37..84b3ee71d175 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -142,6 +142,8 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_READ_IO)) { f2fs_show_injection_info(FAULT_READ_IO); bio->bi_error = -EIO; @@ -155,6 +157,13 @@ static void f2fs_read_end_io(struct bio *bio) return; } + if (first_page != NULL && + __read_io_type(first_page) == F2FS_RD_DATA) { + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); + } + __read_end_io(bio); } @@ -321,6 +330,32 @@ submit_io: submit_bio(bio_op(bio), bio); } +static void __f2fs_submit_read_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + if (trace_android_fs_dataread_start_enabled() && (type == DATA)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL && + __read_io_type(first_page) == F2FS_RD_DATA) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + first_page->mapping->host); + + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + path, + current->comm); + } + } + __submit_bio(sbi, bio, type); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -370,29 +405,6 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, return false; } -static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - enum temp_type temp; - struct f2fs_bio_info *io; - bool ret = false; - - for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { - io = sbi->write_io[btype] + temp; - - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); - - /* TODO: use HOT temp only 
for meta pages now. */ - if (ret || btype == META) - break; - } - return ret; -} - static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { @@ -418,13 +430,19 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, nid_t ino, enum page_type type, bool force) { enum temp_type temp; - - if (!force && !has_merged_page(sbi, inode, page, ino, type)) - return; + bool ret = true; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + if (!force) { + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - __f2fs_submit_merged_write(sbi, type, temp); + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, page, ino); + up_read(&io->io_rwsem); + } + if (ret) + __f2fs_submit_merged_write(sbi, type, temp); /* TODO: use HOT temp only for meta pages now. */ if (type >= META) @@ -485,7 +503,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page): WB_DATA_TYPE(fio->page)); - __submit_bio(fio->sbi, bio, fio->type); + __f2fs_submit_read_bio(fio->sbi, bio, fio->type); return 0; } @@ -615,7 +633,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } ClearPageError(page); inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); - __submit_bio(F2FS_I_SB(inode), bio, DATA); + __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -641,7 +659,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) */ void f2fs_set_data_blkaddr(struct dnode_of_data *dn) { - f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); __set_data_blkaddr(dn); if (set_page_dirty(dn->node_page)) dn->node_changed = true; @@ -671,7 +689,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); - f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = datablock_addr(dn->inode, @@ -955,6 +973,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } + if (direct_io && allow_outplace_dio(inode, iocb, from)) + return 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -968,6 +989,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = true; if (direct_io) { map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); @@ -1026,7 +1048,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE; + int mode = map->m_may_create ? 
ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; @@ -1046,6 +1068,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + map->m_may_create) + goto next_dnode; + map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; @@ -1060,7 +1086,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } next_dnode: - if (create) + if (map->m_may_create) __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ @@ -1097,11 +1123,13 @@ next_block: if (is_valid_data_blkaddr(sbi, blkaddr)) { /* use out-place-update for driect IO under LFS mode */ - if (test_opt(sbi, LFS) && create && - flag == F2FS_GET_BLOCK_DIO) { + if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + map->m_may_create) { err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + blkaddr = dn.data_blkaddr; set_inode_flag(inode, FI_APPEND_WRITE); + } } } else { if (create) { @@ -1207,7 +1235,7 @@ skip: f2fs_put_dnode(&dn); - if (create) { + if (map->m_may_create) { __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } @@ -1233,7 +1261,7 @@ sync_out: } f2fs_put_dnode(&dn); unlock_out: - if (create) { + if (map->m_may_create) { __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } @@ -1255,6 +1283,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; last_lblk = F2FS_BLK_ALIGN(pos + len); while (map.m_lblk < last_lblk) { @@ -1269,7 +1298,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type) + pgoff_t *next_pgofs, int seg_type, bool may_write) { struct f2fs_map_blocks map; int err; @@ -1279,6 +1308,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_next_pgofs = next_pgofs; map.m_next_extent = NULL; map.m_seg_type = seg_type; + map.m_may_create = may_write; err = f2fs_map_blocks(inode, &map, create, flag); if (!err) { @@ -1295,16 +1325,25 @@ static int get_data_block(struct inode *inode, sector_t iblock, { return __get_data_block(inode, iblock, bh_result, create, flag, next_pgofs, - NO_CHECK_TYPE); + NO_CHECK_TYPE, create); +} + +static int get_data_block_dio_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_DIO, NULL, + f2fs_rw_hint_to_seg_type(inode->i_write_hint), + true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type( - inode->i_write_hint)); + F2FS_GET_BLOCK_DIO, NULL, + f2fs_rw_hint_to_seg_type(inode->i_write_hint), + false); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1316,7 +1355,7 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP, NULL, - NO_CHECK_TYPE); + NO_CHECK_TYPE, 
create); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -1523,6 +1562,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; for (; nr_pages; nr_pages--) { if (pages) { @@ -1592,7 +1632,7 @@ got_it: if (bio && (last_block_in_bio != block_nr - 1 || !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), bio, DATA); + __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -1624,7 +1664,7 @@ set_error_page: goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), bio, DATA); + __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1634,7 +1674,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), bio, DATA); + __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1852,6 +1892,8 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); + if (err && PageWriteback(page)) + end_page_writeback(page); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -2139,12 +2181,11 @@ continue_unlock: if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, - DATA, true); + DATA, true, true); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -2321,6 +2362,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, bool locked = false; struct extent_info ei = {0,0,0}; int err = 0; + int flag; /* * we already allocated all the blocks, so we don't need to get @@ -2330,9 +2372,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + else + flag = F2FS_GET_BLOCK_PRE_AIO; + if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + __do_map_lock(sbi, flag, true); locked = true; } restart: @@ -2370,6 +2418,7 @@ restart: f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } @@ -2383,7 +2432,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + __do_map_lock(sbi, flag, false); return err; } @@ -2464,7 +2513,7 @@ repeat: } } - f2fs_wait_on_page_writeback(page, DATA, false); + f2fs_wait_on_page_writeback(page, DATA, false, true); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -2556,6 +2605,53 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, return 0; } +static void f2fs_dio_end_io(struct bio *bio) +{ + struct f2fs_private_dio *dio = bio->bi_private; + + dec_page_count(F2FS_I_SB(dio->inode), + dio->write ? 
F2FS_DIO_WRITE : F2FS_DIO_READ); + + bio->bi_private = dio->orig_private; + bio->bi_end_io = dio->orig_end_io; + + kvfree(dio); + + bio_endio(bio); +} + +static void f2fs_dio_submit_bio(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct f2fs_private_dio *dio; + bool write = (rw == REQ_OP_WRITE); + int err; + + dio = f2fs_kzalloc(F2FS_I_SB(inode), + sizeof(struct f2fs_private_dio), GFP_NOFS); + if (!dio) { + err = -ENOMEM; + goto out; + } + + dio->inode = inode; + dio->orig_end_io = bio->bi_end_io; + dio->orig_private = bio->bi_private; + dio->write = write; + + bio->bi_end_io = f2fs_dio_end_io; + bio->bi_private = dio; + + inc_page_count(F2FS_I_SB(inode), + write ? F2FS_DIO_WRITE : F2FS_DIO_READ); + + submit_bio(rw, bio); + return; +out: + bio->bi_error = -EIO; + bio_endio(bio); +} + static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { @@ -2625,7 +2721,11 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, down_read(&fi->i_gc_rwsem[READ]); } - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); + err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, offset, + rw == WRITE ? get_data_block_dio_write : + get_data_block_dio, NULL, f2fs_dio_submit_bio, + DIO_LOCKING | DIO_SKIP_HOLES); if (do_opu) up_read(&fi->i_gc_rwsem[READ]); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 139b4d5c83d5..f05b37ef7182 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -53,6 +53,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); + si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); + si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); @@ -62,7 +64,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nr_flushed = atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = - atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + atomic_read(&SM_I(sbi)->fcc_info->queued_flush); si->flush_list_empty = llist_empty(&SM_I(sbi)->fcc_info->issue_list); } @@ -70,7 +72,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nr_discarded = atomic_read(&SM_I(sbi)->dcc_info->issued_discard); si->nr_discarding = - atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + atomic_read(&SM_I(sbi)->dcc_info->queued_discard); si->nr_discard_cmd = atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; @@ -94,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = NODE_MAPPING(sbi)->nrpages; - si->meta_pages = META_MAPPING(sbi)->nrpages; + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); @@ -173,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) static void update_mem_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned npages; int i; if (si->base_mem) @@ -197,7 +200,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += 2 * 
SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); @@ -256,10 +259,14 @@ get_cache: sizeof(struct extent_node); si->page_mem = 0; - npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; - npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } } static int stat_show(struct seq_file *s, void *v) @@ -374,6 +381,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - DIO (R: %4d, W: %4d)\n", + si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " @@ -510,7 +519,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kfree(si); + kvfree(si); } int __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index eedc07b46a52..719ba4d92098 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -293,7 +293,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type, true); + f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); set_page_dirty(page); @@ -307,7 +307,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); /* copy name info. 
to this inode page */ ri = F2FS_INODE(ipage); @@ -550,7 +550,7 @@ start: ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA, true); + f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); @@ -705,7 +705,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -808,6 +808,17 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); + /* check memory boundary before moving forward */ + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + if (unlikely(bit_pos > d->max)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted namelen=%d, run fsck to fix.", + __func__, le16_to_cpu(de->name_len)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + goto out; + } + if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; @@ -830,7 +841,6 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } out: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 85542cba3fa2..b401578f8fb4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -69,7 +69,7 @@ struct f2fs_fault_info { unsigned int inject_type; }; -extern char *f2fs_fault_name[FAULT_MAX]; +extern const char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif @@ -154,12 +154,13 @@ struct f2fs_mount_info { #define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_FEATURE_SB_CHKSUM 0x0800 -#define F2FS_HAS_FEATURE(sb, mask) \ - ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) -#define F2FS_SET_FEATURE(sb, mask) \ - (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sb, mask) \ - (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) +#define __F2FS_HAS_FEATURE(raw_super, mask) \ + ((raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) +#define F2FS_SET_FEATURE(sbi, mask) \ + (sbi->raw_super->feature |= cpu_to_le32(mask)) +#define F2FS_CLEAR_FEATURE(sbi, mask) \ + (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ -347,7 +348,7 @@ struct discard_cmd { struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ - unsigned char issuing; /* issuing discard */ + unsigned char queued; /* queued discard */ int error; /* bio error */ spinlock_t lock; /* for state/bio_ref updating */ unsigned short bio_ref; /* bio reference count */ @@ -389,7 +390,7 @@ struct discard_cmd_control { unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ - atomic_t issing_discard; /* # of issing discard */ + atomic_t queued_discard; /* # of queued discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ @@ -479,6 +480,7 @@ static inline bool __has_cursum_space(struct 
f2fs_journal *journal, #define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ #define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ #define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ +#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -617,16 +619,8 @@ struct extent_info { }; struct extent_node { - struct rb_node rb_node; - union { - struct { - unsigned int fofs; - unsigned int len; - u32 blk; - }; - struct extent_info ei; /* extent info */ - - }; + struct rb_node rb_node; /* rb node located in rb-tree */ + struct extent_info ei; /* extent info */ struct list_head list; /* node in global extent list of sbi */ struct extent_tree *et; /* extent tree pointer */ }; @@ -661,6 +655,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; + bool m_may_create; /* indicate it is from write path */ }; /* for flag in get_data_block */ @@ -949,7 +944,7 @@ struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ atomic_t issued_flush; /* # of issued flushes */ - atomic_t issing_flush; /* # of issing flushes */ + atomic_t queued_flush; /* # of queued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -1016,6 +1011,8 @@ enum count_type { F2FS_RD_DATA, F2FS_RD_NODE, F2FS_RD_META, + F2FS_DIO_WRITE, + F2FS_DIO_READ, NR_COUNT_TYPE, }; @@ -1230,8 +1227,6 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; - /* bio ordering for NODE/DATA */ /* keep migration IO order for LFS mode */ struct rw_semaphore io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ @@ -1323,6 +1318,7 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ + unsigned int next_victim_seg[2]; /* next segment in victim section */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ @@ -1332,6 +1328,8 @@ struct f2fs_sb_info { /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; + /* migration granularity of garbage collection, unit: segment */ + unsigned int migration_granularity; /* * for stat information. @@ -1390,6 +1388,13 @@ struct f2fs_sb_info { __u32 s_chksum_seed; }; +struct f2fs_private_dio { + struct inode *inode; + void *orig_private; + bio_end_io_t *orig_end_io; + bool write; +}; + #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ @@ -1668,12 +1673,16 @@ static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; - set_sbi_flag(sbi, SBI_NEED_FSCK); + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. 
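struct f2fs_private_dio, defined above, supports the classic bio completion interposition used later in data.c: save bi_end_io/bi_private, account the in-flight DIO, and restore both before chaining to the original completion. A kernel-free sketch of the same pattern, with a hypothetical struct request standing in for struct bio:

    #include <stdlib.h>

    struct request {
        void (*end_io)(struct request *);
        void *private;
    };

    struct wrapper {
        void (*orig_end_io)(struct request *);
        void *orig_private;
    };

    /* Completion side: undo the interposition before chaining, exactly
     * the order f2fs_dio_end_io() uses with bi_private/bi_end_io. */
    static void wrapped_end_io(struct request *rq)
    {
        struct wrapper *w = rq->private;

        /* ...account for the completed request here... */
        rq->private = w->orig_private;
        rq->end_io = w->orig_end_io;
        free(w);
        if (rq->end_io)
            rq->end_io(rq);
    }

    /* Submission side: interpose, as f2fs_dio_submit_bio() does. */
    static int submit_wrapped(struct request *rq)
    {
        struct wrapper *w = calloc(1, sizeof(*w));

        if (!w)
            return -1;  /* f2fs fails the bio with -EIO instead */
        w->orig_end_io = rq->end_io;
        w->orig_private = rq->private;
        rq->end_io = wrapped_end_io;
        rq->private = w;
        /* ...submit rq... */
        return 0;
    }

    static void orig_done(struct request *rq) { (void)rq; }

    int main(void)
    {
        struct request rq = { .end_io = orig_done, .private = NULL };

        if (submit_wrapped(&rq) == 0)
            rq.end_io(&rq);     /* simulated completion path */
        return 0;
    }

The in-flight counters this maintains (F2FS_DIO_READ/F2FS_DIO_WRITE) are what lets the reworked is_idle() below see outstanding direct IO.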
+ */ if (lock) spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - kfree(NM_I(sbi)->nat_bits); + kvfree(NM_I(sbi)->nat_bits); NM_I(sbi)->nat_bits = NULL; if (lock) spin_unlock_irqrestore(&sbi->cp_lock, flags); @@ -2206,7 +2215,11 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || - get_pages(sbi, F2FS_WB_CP_DATA)) + get_pages(sbi, F2FS_WB_CP_DATA) || + get_pages(sbi, F2FS_DIO_READ) || + get_pages(sbi, F2FS_DIO_WRITE) || + atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return false; return f2fs_time_over(sbi, type); } @@ -2430,6 +2443,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_NEW_INODE: if (set) return; + /* fall through */ case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: @@ -2732,32 +2746,47 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT)) return false; + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + return S_ISREG(inode->i_mode); } +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { + void *ret; + if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(FAULT_KMALLOC); return NULL; } - return kmalloc(size, flags); -} - -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - void *ret; + ret = kmalloc(size, flags); + if (ret) + return ret; - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; + return kvmalloc(size, flags); } static inline void *kvzalloc(size_t size, gfp_t flags) @@ -2842,6 +2871,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) + #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ (!is_read_io(fio->op) || fio->is_meta)) @@ -3086,7 +3117,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered); + enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); @@ -3226,6 +3257,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_rd_data, nr_rd_node, nr_rd_meta; + int nr_dio_read, nr_dio_write; unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; @@ -3537,9 +3569,9 @@ static inline bool f2fs_post_read_required(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct super_block *sb) \ +static inline 
int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ + return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); @@ -3569,7 +3601,7 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi, static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - return f2fs_sb_has_blkzoned(sbi->sb); + return f2fs_sb_has_blkzoned(sbi); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) @@ -3644,7 +3676,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. */ - if (f2fs_sb_has_blkzoned(sbi->sb)) + if (f2fs_sb_has_blkzoned(sbi)) return true; if (test_opt(sbi, LFS) && (rw == WRITE) && block_unaligned_IO(inode, iocb, iter)) @@ -3667,7 +3699,7 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA - if (f2fs_sb_has_quota_ino(sbi->sb)) + if (f2fs_sb_has_quota_ino(sbi)) return true; if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c61f9619be1..b44e02b8c677 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -85,7 +85,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, } /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA, false); + f2fs_wait_on_page_writeback(page, DATA, false, true); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); @@ -219,6 +219,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, trace_f2fs_sync_file_enter(inode); + if (S_ISDIR(inode->i_mode)) + goto go_write; + /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); @@ -578,7 +581,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); zero_user(page, offset, PAGE_SIZE - offset); /* An encrypted inode should have a key and truncate the last page. 
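F2FS_FEATURE_FUNCS is a token-pasting generator, and the hunk above only changes the parameter each generated f2fs_sb_has_*() predicate takes (sbi instead of sb). A minimal sketch of the generator pattern itself, with invented FEATURE_* flags and names:

    #include <stdio.h>

    struct sb_info { unsigned int features; };

    #define FEATURE_ENCRYPT 0x0001u
    #define FEATURE_VERITY  0x0400u

    /* Same shape as F2FS_FEATURE_FUNCS: one macro invocation emits one
     * inline predicate per feature, keeping call sites readable. */
    #define FEATURE_FUNCS(name, flagname)                       \
    static inline int sb_has_##name(struct sb_info *sbi)        \
    {                                                           \
        return (sbi->features & FEATURE_##flagname) != 0;       \
    }

    FEATURE_FUNCS(encrypt, ENCRYPT)
    FEATURE_FUNCS(verity, VERITY)

    int main(void)
    {
        struct sb_info sbi = { .features = FEATURE_ENCRYPT };

        printf("%d %d\n", sb_has_encrypt(&sbi), sb_has_verity(&sbi));
        return 0;
    }

Passing sbi directly saves the repeated F2FS_SB(sb) dereference at every call site the rest of this series converts.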
*/ @@ -700,7 +703,7 @@ int f2fs_getattr(struct vfsmount *mnt, unsigned int flags; if (f2fs_has_extra_attr(inode) && - f2fs_sb_has_inode_crtime(inode->i_sb) && + f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = fi->i_crtime.tv_sec; @@ -899,7 +902,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -1503,7 +1506,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, + .m_may_create = true }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1746,10 +1750,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1757,7 +1763,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1962,6 +1968,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; + case F2FS_GOING_DOWN_NEED_FSCK: + set_sbi_flag(sbi, SBI_NEED_FSCK); + /* do checkpoint only */ + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; + break; default: ret = -EINVAL; goto out; @@ -2030,7 +2043,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - if (!f2fs_sb_has_encrypt(inode->i_sb)) + if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode))) return -EOPNOTSUPP; f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -2040,7 +2053,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -2051,7 +2064,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (!f2fs_sb_has_encrypt(inode->i_sb)) + if (!f2fs_sb_has_encrypt(sbi)) return -EOPNOTSUPP; err = mnt_want_write_file(filp); @@ -2155,7 +2168,7 @@ do_more: } ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); - range.start += sbi->blocks_per_seg; + range.start += BLKS_PER_SEC(sbi); if (range.start <= end) goto do_more; out: @@ -2197,7 +2210,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct 
f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE }; + .m_seg_type = NO_CHECK_TYPE , + .m_may_create = false }; struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; @@ -2560,7 +2574,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) return -EFAULT; if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || - sbi->segs_per_sec != 1) { + __is_large_section(sbi)) { f2fs_msg(sbi->sb, KERN_WARNING, "Can't flush %u in %d for segs_per_sec %u != 1\n", range.dev_num, sbi->s_ndevs, @@ -2711,6 +2725,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_pgofs = NULL; map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; end = F2FS_I_SB(inode)->max_file_blocks; while (map.m_lblk < end) { diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 843430b55a73..3624d1336bd1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -142,7 +142,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { err = PTR_ERR(gc_th->f2fs_gc_task); - kfree(gc_th); + kvfree(gc_th); sbi->gc_thread = NULL; } out: @@ -155,7 +155,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); - kfree(gc_th); + kvfree(gc_th); sbi->gc_thread = NULL; } @@ -323,8 +323,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_cost = get_max_cost(sbi, &p); if (*result != NULL_SEGNO) { - if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && - get_valid_blocks(sbi, *result, false) && + if (get_valid_blocks(sbi, *result, false) && !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) p.min_segno = *result; goto out; @@ -333,6 +332,22 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (p.max_search == 0) goto out; + if (__is_large_section(sbi) && p.alloc_mode == LFS) { + if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[BG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + goto got_result; + } + if (gc_type == FG_GC && + sbi->next_victim_seg[FG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[FG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + goto got_result; + } + } + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); @@ -395,6 +410,8 @@ next: } if (p.min_segno != NULL_SEGNO) { got_it: + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +got_result: if (p.alloc_mode == LFS) { secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) @@ -402,13 +419,13 @@ got_it: else set_bit(secno, dirty_i->victim_secmap); } - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + } +out: + if (p.min_segno != NULL_SEGNO) trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); - } -out: mutex_unlock(&dirty_i->seglist_lock); return (p.min_segno == NULL_SEGNO) ? 0 : 1; @@ -658,6 +675,14 @@ got_it: fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. 
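A recurring change in the f2fs hunks here and below is the ordering of writeback waits: the series waits for writeback before redirtying a page instead of after, so in-flight writeback can never race with the redirty and counters such as F2FS_WB_CP_DATA stay exact. The new fourth argument to f2fs_wait_on_page_writeback() lets callers assert this for locked pages. A hedged before/after sketch (page assumed locked):

	/* old order: writeback may still be in flight when the page is redirtied */
	set_page_dirty(page);
	f2fs_wait_on_page_writeback(page, DATA, true);

	/* new order: wait first, then redirty; with the page locked nothing can
	 * start new writeback in between, which the reworked helper asserts via
	 * f2fs_bug_on(sbi, locked && PageWriteback(page)) */
	f2fs_wait_on_page_writeback(page, DATA, true, true);
	set_page_dirty(page);
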
+ */ + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), dn.data_blkaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -743,7 +768,9 @@ static int move_data_block(struct inode *inode, block_t bidx, * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. */ - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); if (err) @@ -802,8 +829,8 @@ static int move_data_block(struct inode *inode, block_t bidx, } write_page: + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); @@ -811,7 +838,7 @@ write_page: ClearPageError(page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; @@ -897,8 +924,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, bool is_dirty = PageDirty(page); retry: + f2fs_wait_on_page_writeback(page, DATA, true, true); + set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); @@ -1093,15 +1121,18 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int seg_freed = 0; + int seg_freed = 0, migrated = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
SUM_TYPE_DATA : SUM_TYPE_NODE; int submitted = 0; + if (__is_large_section(sbi)) + end_segno = rounddown(end_segno, sbi->segs_per_sec); + /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), - sbi->segs_per_sec, META_SSA, true); + end_segno - segno, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { @@ -1130,10 +1161,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, false) == 0 || - !PageUptodate(sum_page) || - unlikely(f2fs_cp_error(sbi))) - goto next; + if (get_valid_blocks(sbi, segno, false) == 0) + goto freed; + if (__is_large_section(sbi) && + migrated >= sbi->migration_granularity) + goto skip; + if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) + goto skip; sum = page_address(sum_page); if (type != GET_SUM_TYPE((&sum->footer))) { @@ -1141,7 +1175,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, "type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - goto next; + goto skip; } /* @@ -1160,10 +1194,15 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, stat_inc_seg_count(sbi, type, gc_type); +freed: if (gc_type == FG_GC && get_valid_blocks(sbi, segno, false) == 0) seg_freed++; -next: + migrated++; + + if (__is_large_section(sbi) && segno + 1 < end_segno) + sbi->next_victim_seg[gc_type] = segno + 1; +skip: f2fs_put_page(sum_page, 0); } @@ -1307,7 +1346,7 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ - if (sbi->s_ndevs && sbi->segs_per_sec == 1) + if (sbi->s_ndevs && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fdf4abcf2dab..b63b2ae1acc7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -73,7 +73,7 @@ void f2fs_truncate_inline_inode(struct inode *inode, addr = inline_data_addr(inode, ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); @@ -179,7 +179,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); f2fs_outplace_write_data(dn, &fio); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); if (dirty) { inode_dec_dirty_pages(dn->inode); f2fs_remove_dirty_inode(dn->inode); @@ -254,7 +254,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(inode, dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); @@ -295,7 +295,7 @@ process_inline: ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); src_addr = inline_data_addr(inode, npage); dst_addr = inline_data_addr(inode, ipage); @@ -409,7 +409,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page 
*ipage, goto out; } - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); dentry_blk = page_address(page); @@ -519,18 +519,18 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); - kfree(backup_dentry); + kvfree(backup_dentry); return 0; recover: lock_page(ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); - kfree(backup_dentry); + kvfree(backup_dentry); return err; } @@ -583,7 +583,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, } } - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); name_hash = f2fs_dentry_hash(new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); @@ -615,7 +615,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); inline_dentry = inline_data_addr(dir, page); make_dentry_ptr_inline(dir, &d, inline_dentry); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1b1f22faff22..3e739e4986b8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -103,7 +103,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); set_inode_flag(inode, FI_DATA_EXIST); set_raw_inline(inode, F2FS_INODE(ipage)); @@ -118,7 +118,7 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page { struct f2fs_inode *ri = &F2FS_NODE(page)->i; - if (!f2fs_sb_has_inode_chksum(sbi->sb)) + if (!f2fs_sb_has_inode_chksum(sbi)) return false; if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) @@ -218,7 +218,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) + if (f2fs_sb_has_flexible_inline_xattr(sbi) && !f2fs_has_extra_attr(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, @@ -228,7 +228,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (f2fs_has_extra_attr(inode) && - !f2fs_sb_has_extra_attr(sbi->sb)) { + !f2fs_sb_has_extra_attr(sbi)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, "%s: inode (ino=%lx) is with extra_attr, " @@ -340,7 +340,7 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
le16_to_cpu(ri->i_extra_isize) : 0; - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -390,14 +390,14 @@ static int do_read_inode(struct inode *inode) if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) i_projid = (projid_t)le32_to_cpu(ri->i_projid); else i_projid = F2FS_DEF_PROJID; fi->i_projid = make_kprojid(&init_user_ns, i_projid); - if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) && + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); @@ -497,7 +497,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; - f2fs_wait_on_page_writeback(node_page, NODE, true); + f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); f2fs_inode_synced(inode); @@ -542,11 +542,11 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); - if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode))) ri->i_inline_xattr_size = cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); - if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { projid_t i_projid; @@ -556,7 +556,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_projid = cpu_to_le32(i_projid); } - if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) && + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_crtime)) { ri->i_crtime = diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f2cdf47b76b6..367302907d07 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -61,7 +61,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } - if (f2fs_sb_has_project_quota(sbi->sb) && + if (f2fs_sb_has_project_quota(sbi) && (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else @@ -79,7 +79,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_sb_has_extra_attr(sbi->sb)) { + if (f2fs_sb_has_extra_attr(sbi)) { set_inode_flag(inode, FI_EXTRA_ATTR); F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; } @@ -92,7 +92,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); if (f2fs_has_inline_xattr(inode)) xattr_size = F2FS_OPTION(sbi).inline_xattr_size; @@ -634,7 +634,7 @@ out_f2fs_handle_failed_inode: f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned 
char *)symname) - kfree(disk_link.name); + kvfree(disk_link.name); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 08edc0930831..f6a1e1e0f891 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -826,6 +826,7 @@ static int truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; int err; + pgoff_t index; err = f2fs_get_node_info(sbi, dn->nid, &ni); if (err) @@ -845,10 +846,11 @@ static int truncate_node(struct dnode_of_data *dn) clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); + index = dn->node_page->index; f2fs_put_page(dn->node_page, 1); invalidate_mapping_pages(NODE_MAPPING(sbi), - dn->node_page->index, dn->node_page->index); + index, index); dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); @@ -1104,7 +1106,7 @@ skip_partial: ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); BUG_ON(page->mapping != NODE_MAPPING(sbi)); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -1232,7 +1234,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(page, S_ISDIR(dn->inode->i_mode)); if (!PageUptodate(page)) @@ -1598,10 +1600,10 @@ int f2fs_move_node_page(struct page *node_page, int gc_type) .for_reclaim = 0, }; + f2fs_wait_on_page_writeback(node_page, NODE, true, true); + set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); if (!clear_page_dirty_for_io(node_page)) { err = -EAGAIN; goto out_page; @@ -1689,8 +1691,7 @@ continue_unlock: goto continue_unlock; } - f2fs_wait_on_page_writeback(page, NODE, true); - BUG_ON(PageWriteback(page)); + f2fs_wait_on_page_writeback(page, NODE, true, true); set_fsync_mark(page, 0); set_dentry_mark(page, 0); @@ -1741,7 +1742,7 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); - f2fs_wait_on_page_writeback(last_page, NODE, true); + f2fs_wait_on_page_writeback(last_page, NODE, true, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; @@ -1822,9 +1823,8 @@ continue_unlock: goto lock_node; } - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); - BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -1891,7 +1891,7 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, get_page(page); spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, false); if (TestClearPageError(page)) ret = -EIO; @@ -2472,7 +2472,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); memcpy(dst_addr, src_addr, inline_size); update_inode: f2fs_update_inode(inode, ipage); @@ -2566,17 +2566,17 @@ retry: if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; - if 
(f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + if (f2fs_sb_has_flexible_inline_xattr(sbi) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_inline_xattr_size)) dst->i_inline_xattr_size = src->i_inline_xattr_size; - if (f2fs_sb_has_project_quota(sbi->sb) && + if (f2fs_sb_has_project_quota(sbi) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; - if (f2fs_sb_has_inode_crtime(sbi->sb) && + if (f2fs_sb_has_inode_crtime(sbi) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_crtime_nsec)) { dst->i_crtime = src->i_crtime; @@ -3118,17 +3118,17 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) kvfree(nm_i->free_nid_bitmap[i]); - kfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_bitmap); } kvfree(nm_i->free_nid_count); - kfree(nm_i->nat_bitmap); - kfree(nm_i->nat_bits); + kvfree(nm_i->nat_bitmap); + kvfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS - kfree(nm_i->nat_bitmap_mir); + kvfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; - kfree(nm_i); + kvfree(nm_i); } int __init f2fs_create_node_manager_caches(void) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1c73d879a9bc..e05af5df5648 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -361,7 +361,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE, true); + f2fs_wait_on_page_writeback(p, NODE, true, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 336273a81fba..aa3f5633a20c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -250,7 +250,7 @@ static int recover_inode(struct inode *inode, struct page *page) i_gid_write(inode, le32_to_cpu(raw->i_gid)); if (raw->i_inline & F2FS_EXTRA_ATTR) { - if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), i_projid)) { projid_t i_projid; @@ -531,7 +531,7 @@ retry_dn: goto out; } - f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); err = f2fs_get_node_info(sbi, dn.nid, &ni); if (err) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fb4be4629add..ccefcf8016c4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -229,7 +229,7 @@ static int __revoke_inmem_pages(struct inode *inode, lock_page(page); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); if (recover) { struct dnode_of_data dn; @@ -387,8 +387,9 @@ static int __f2fs_commit_inmem_pages(struct inode *inode) if (page->mapping == inode->i_mapping) { trace_f2fs_commit_inmem_page(page, INMEM); + f2fs_wait_on_page_writeback(page, DATA, true, true); + set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); @@ -620,14 +621,16 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { + atomic_inc(&fcc->queued_flush); ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + if (atomic_inc_return(&fcc->queued_flush) == 1 || sbi->s_ndevs > 1) { ret = submit_flush_wait(sbi, ino); - atomic_dec(&fcc->issing_flush); + 
atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; @@ -646,14 +649,14 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); } else { struct llist_node *list; list = llist_del_all(&fcc->issue_list); if (!list) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); } else { struct flush_cmd *tmp, *next; @@ -662,7 +665,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { cmd.ret = ret; - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); continue; } tmp->ret = ret; @@ -691,7 +694,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); - atomic_set(&fcc->issing_flush, 0); + atomic_set(&fcc->queued_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -703,7 +706,7 @@ init_thread: "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); - kfree(fcc); + kvfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } @@ -722,7 +725,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) kthread_stop(flush_thread); } if (free) { - kfree(fcc); + kvfree(fcc); SM_I(sbi)->fcc_info = NULL; } } @@ -907,7 +910,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc->len = len; dc->ref = 0; dc->state = D_PREP; - dc->issuing = 0; + dc->queued = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); @@ -939,7 +942,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_sub(dc->issuing, &dcc->issing_discard); + atomic_sub(dc->queued, &dcc->queued_discard); list_del(&dc->list); rb_erase(&dc->rb_node, &dcc->root); @@ -1223,12 +1226,12 @@ submit: dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); - atomic_inc(&dcc->issing_discard); - dc->issuing++; + atomic_inc(&dcc->queued_discard); + dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ - __check_sit_bitmap(sbi, start, start + len); + __check_sit_bitmap(sbi, lstart, lstart + len); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; @@ -1725,6 +1728,10 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) dcc->discard_wake = 0; + /* clean up pending candidates before going to sleep */ + if (atomic_read(&dcc->queued_discard)) + __wait_all_discard_cmd(sbi, NULL); + if (try_to_freeze()) continue; if (f2fs_readonly(sbi->sb)) @@ -1810,7 +1817,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi->sb) && + if (f2fs_sb_has_blkzoned(sbi) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif @@ -1958,7 +1965,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; + bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); mutex_lock(&dirty_i->seglist_lock); @@ -1990,7 +1997,7 @@ void 
f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, (end - 1) <= cpc->trim_end) continue; - if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; @@ -2022,7 +2029,7 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (f2fs_sb_has_blkzoned(sbi->sb) || + if (f2fs_sb_has_blkzoned(sbi) || (force && len < cpc->trim_minlen)) goto skip; @@ -2070,7 +2077,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); - atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; @@ -2086,7 +2093,7 @@ init_thread: "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); - kfree(dcc); + kvfree(dcc); SM_I(sbi)->dcc_info = NULL; return err; } @@ -2103,7 +2110,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) f2fs_stop_discard_thread(sbi); - kfree(dcc); + kvfree(dcc); SM_I(sbi)->dcc_info = NULL; } @@ -2222,7 +2229,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* update total number of valid blocks to be written in ckpt area */ SIT_I(sbi)->written_valid_blocks += del; - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } @@ -2488,7 +2495,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { /* if segs_per_sec is large than 1, we need to keep original policy. 
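Most of the segs_per_sec comparisons touched in this file are folded into the __is_large_section() helper defined earlier in f2fs.h, so the "several segments per section" case is tested one way everywhere:

	#define __is_large_section(sbi)	((sbi)->segs_per_sec > 1)

	/* open-coded checks such as */
	bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1;
	/* become */
	bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);

Large-section layouts matter to GC and discard because victim selection, SSA readahead and valid-block accounting must then operate on whole sections rather than single segments, as the do_garbage_collect() changes above show.
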
*/ - if (sbi->segs_per_sec != 1) + if (__is_large_section(sbi)) return CURSEG_I(sbi, type)->segno; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) @@ -2798,7 +2805,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; - bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; + bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -3349,16 +3356,18 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered) + enum page_type type, bool ordered, bool locked) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); - if (ordered) + if (ordered) { wait_on_page_writeback(page); - else + f2fs_bug_on(sbi, locked && PageWriteback(page)); + } else { wait_for_stable_page(page); + } } } @@ -3375,7 +3384,7 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA, true); + f2fs_wait_on_page_writeback(cpage, DATA, true, true); f2fs_put_page(cpage, 1); } } @@ -3957,7 +3966,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) if (!sit_i->tmp_map) return -ENOMEM; - if (sbi->segs_per_sec > 1) { + if (__is_large_section(sbi)) { sit_i->sec_entries = f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), MAIN_SECS(sbi)), @@ -4112,7 +4121,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks; } - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; } @@ -4156,7 +4165,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sbi->discard_blks -= se->valid_blocks; } - if (sbi->segs_per_sec > 1) { + if (__is_large_section(sbi)) { get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; get_sec_entry(sbi, start)->valid_blocks -= @@ -4391,7 +4400,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; - kfree(dirty_i); + kvfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) @@ -4403,10 +4412,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { - kfree(array[i].sum_blk); - kfree(array[i].journal); + kvfree(array[i].sum_blk); + kvfree(array[i].journal); } - kfree(array); + kvfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) @@ -4417,7 +4426,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); - kfree(free_i); + kvfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) @@ -4430,26 +4439,26 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { - kfree(sit_i->sentries[start].cur_valid_map); + kvfree(sit_i->sentries[start].cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - kfree(sit_i->sentries[start].cur_valid_map_mir); + kvfree(sit_i->sentries[start].cur_valid_map_mir); #endif - kfree(sit_i->sentries[start].ckpt_valid_map); - kfree(sit_i->sentries[start].discard_map); + kvfree(sit_i->sentries[start].ckpt_valid_map); + 
kvfree(sit_i->sentries[start].discard_map); } } - kfree(sit_i->tmp_map); + kvfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; - kfree(sit_i->sit_bitmap); + kvfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS - kfree(sit_i->sit_bitmap_mir); + kvfree(sit_i->sit_bitmap_mir); #endif - kfree(sit_i); + kvfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) @@ -4465,7 +4474,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; - kfree(sm_info); + kvfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ab3465faddf1..a77f76f528b6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -333,7 +333,7 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (use_section && sbi->segs_per_sec > 1) + if (use_section && __is_large_section(sbi)) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 9e13db994fdf..a467aca29cfe 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -135,6 +135,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); spin_lock(&f2fs_list_lock); - list_del(&sbi->s_list); + list_del_init(&sbi->s_list); spin_unlock(&f2fs_list_lock); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c551d5336473..a9e7a1e62c66 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -38,7 +38,7 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION -char *f2fs_fault_name[FAULT_MAX] = { +const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", @@ -259,7 +259,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } - if (f2fs_sb_has_quota_ino(sb)) { + if (f2fs_sb_has_quota_ino(sbi)) { f2fs_msg(sb, KERN_INFO, "QUOTA feature is enabled, so ignore qf_name"); return 0; @@ -289,7 +289,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, set_opt(sbi, QUOTA); return 0; errout: - kfree(qname); + kvfree(qname); return ret; } @@ -302,7 +302,7 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) " when quota turned on"); return -EINVAL; } - kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]); F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -314,7 +314,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ - if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" "Cannot enable project quota enforcement."); return -1; @@ -348,7 +348,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } } - if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { + if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_INFO, "QUOTA feature is enabled, so ignore jquota_fmt"); F2FS_OPTION(sbi).s_jquota_fmt = 0; @@ -399,10 +399,10 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, BG_GC); set_opt(sbi, FORCE_FG_GC); } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_disable_roll_forward: set_opt(sbi, DISABLE_ROLL_FORWARD); @@ -417,7 +417,7 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, DISCARD); break; case Opt_nodiscard: - if (f2fs_sb_has_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sb, KERN_WARNING, "discard is required for zoned block devices"); return -EINVAL; @@ -566,11 +566,11 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { - if (f2fs_sb_has_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sb, KERN_WARNING, "adaptive mode is not allowed with " "zoned block device feature"); - kfree(name); + kvfree(name); return -EINVAL; } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); @@ -578,10 +578,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "lfs", 3)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_io_size_bits: if (args->from && match_int(args, &arg)) @@ -714,10 +714,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "fs-based", 8)) { F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_alloc: name = match_strdup(&args[0]); @@ -731,10 +731,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "reuse", 5)) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_fsync: name = match_strdup(&args[0]); @@ -751,14 +751,14 @@ static int parse_options(struct super_block *sb, char *options) F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_NOBARRIER; } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_test_dummy_encryption: #ifdef CONFIG_F2FS_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sb)) { + if (!f2fs_sb_has_encrypt(sbi)) { f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); return -EINVAL; } @@ -783,10 +783,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "disable", 7)) { set_opt(sbi, DISABLE_CHECKPOINT); } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; default: f2fs_msg(sb, KERN_ERR, @@ -799,13 +799,13 @@ static int parse_options(struct super_block *sb, char *options) if (f2fs_check_quota_options(sbi)) return -EINVAL; #else - if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, "Filesystem with quota feature cannot be mounted RDWR " "without CONFIG_QUOTA"); return -EINVAL; } - if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) { + if (f2fs_sb_has_project_quota(sbi) 
&& !f2fs_readonly(sbi->sb)) { f2fs_msg(sb, KERN_ERR, "Filesystem with project quota feature cannot be " "mounted RDWR without CONFIG_QUOTA"); @@ -821,8 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { - if (!f2fs_sb_has_extra_attr(sb) || - !f2fs_sb_has_flexible_inline_xattr(sb)) { + if (!f2fs_sb_has_extra_attr(sbi) || + !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, "extra_attr or flexible_inline_xattr " "feature is off"); @@ -1017,10 +1017,10 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef CONFIG_BLK_DEV_ZONED - kfree(FDEV(i).blkz_type); + kvfree(FDEV(i).blkz_type); #endif } - kfree(sbi->devs); + kvfree(sbi->devs); } static void f2fs_put_super(struct super_block *sb) @@ -1058,9 +1058,6 @@ static void f2fs_put_super(struct super_block *sb) f2fs_write_checkpoint(sbi, &cpc); } - /* f2fs_write_checkpoint can update stat informaion */ - f2fs_destroy_stats(sbi); - /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. @@ -1078,32 +1075,41 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); iput(sbi->node_inode); + sbi->node_inode = NULL; + iput(sbi->meta_inode); + sbi->meta_inode = NULL; + + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. + */ + f2fs_destroy_stats(sbi); /* destroy f2fs internal modules */ f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); - kfree(sbi->ckpt); + kvfree(sbi->ckpt); f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kfree(sbi->raw_super); + kvfree(sbi->raw_super); destroy_device_list(sbi); if (sbi->write_io_dummy) mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(F2FS_OPTION(sbi).s_qf_names[i]); + kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) - kfree(sbi->write_io[i]); - kfree(sbi); + kvfree(sbi->write_io[i]); + kvfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) @@ -1432,7 +1438,7 @@ static void default_options(struct f2fs_sb_info *sbi) clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi->sb)) + if (f2fs_sb_has_blkzoned(sbi)) set_opt_mode(sbi, F2FS_MOUNT_LFS); else set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); @@ -1458,19 +1464,16 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= MS_ACTIVE; - mutex_lock(&sbi->gc_mutex); f2fs_update_time(sbi, DISABLE_TIME); while (!f2fs_time_over(sbi, DISABLE_TIME)) { + mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err == -ENODATA) break; - if (err && err != -EAGAIN) { - mutex_unlock(&sbi->gc_mutex); + if (err && err != -EAGAIN) return err; - } } - mutex_unlock(&sbi->gc_mutex); err = sync_filesystem(sbi->sb); if (err) @@ -1532,7 +1535,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) GFP_KERNEL); if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kfree(org_mount_opt.s_qf_names[j]); + kvfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { @@ -1576,7 +1579,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; if (sb_any_quota_suspended(sb)) { dquot_resume(sb, -1); - } else 
if (f2fs_sb_has_quota_ino(sb)) { + } else if (f2fs_sb_has_quota_ino(sbi)) { err = f2fs_enable_quotas(sb); if (err) goto restore_opts; @@ -1652,7 +1655,7 @@ skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kfree(org_mount_opt.s_qf_names[i]); + kvfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1673,7 +1676,7 @@ restore_opts: #ifdef CONFIG_QUOTA F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kfree(F2FS_OPTION(sbi).s_qf_names[i]); + kvfree(F2FS_OPTION(sbi).s_qf_names[i]); F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif @@ -1818,7 +1821,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) int enabled = 0; int i, err; - if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + if (f2fs_sb_has_quota_ino(sbi) && rdonly) { err = f2fs_enable_quotas(sbi->sb); if (err) { f2fs_msg(sbi->sb, KERN_ERR, @@ -1849,7 +1852,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, unsigned long qf_inum; int err; - BUG_ON(!f2fs_sb_has_quota_ino(sb)); + BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); qf_inum = f2fs_qf_ino(sb, type); if (!qf_inum) @@ -1994,7 +1997,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) goto out_put; err = dquot_quota_off(sb, type); - if (err || f2fs_sb_has_quota_ino(sb)) + if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb))) goto out_put; inode_lock(inode); @@ -2177,7 +2180,7 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, * if LOST_FOUND feature is enabled. * */ - if (f2fs_sb_has_lost_found(sbi->sb) && + if (f2fs_sb_has_lost_found(sbi) && inode->i_ino == F2FS_ROOT_INO(sbi)) return -EPERM; @@ -2400,7 +2403,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, __u32 crc = 0; /* Check checksum_offset and crc in superblock */ - if (le32_to_cpu(raw_super->feature) & F2FS_FEATURE_SB_CHKSUM) { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) { crc_offset = le32_to_cpu(raw_super->checksum_offset); if (crc_offset != offsetof(struct f2fs_super_block, crc)) { @@ -2500,10 +2503,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { f2fs_msg(sb, KERN_INFO, - "Wrong segment_count / block_count (%u > %u)", - segment_count, le32_to_cpu(raw_super->block_count)); + "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); return 1; } @@ -2678,7 +2681,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i, j; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -2696,7 +2699,10 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; + sbi->migration_granularity = sbi->segs_per_sec; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -2714,9 +2720,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - for 
(i = 0; i < NR_PAGE_TYPE - 1; i++) - for (j = HOT; j < NR_TEMP_TYPE; j++) - mutex_init(&sbi->wio_mutex[i][j]); init_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); @@ -2753,7 +2756,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) unsigned int n = 0; int err = -EIO; - if (!f2fs_sb_has_blkzoned(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi)) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != @@ -2804,7 +2807,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) } } - kfree(zones); + kvfree(zones); return err; } @@ -2864,7 +2867,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, /* No valid superblock */ if (!*raw_super) - kfree(super); + kvfree(super); else err = 0; @@ -2884,7 +2887,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) } /* we should update superblock crc here */ - if (!recover && f2fs_sb_has_sb_chksum(sbi->sb)) { + if (!recover && f2fs_sb_has_sb_chksum(sbi)) { crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), offsetof(struct f2fs_super_block, crc)); F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); @@ -2976,7 +2979,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_has_blkzoned(sbi->sb)) { + !f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sbi->sb, KERN_ERR, "Zoned block device feature not enabled\n"); return -EINVAL; @@ -3072,7 +3075,7 @@ try_onemore: sbi->raw_super = raw_super; /* precompute checksum seed for metadata */ - if (f2fs_sb_has_inode_chksum(sb)) + if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); @@ -3082,7 +3085,7 @@ try_onemore: * devices, but mandatory for host-managed zoned block devices. */ #ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); err = -EOPNOTSUPP; @@ -3109,13 +3112,13 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - if (f2fs_sb_has_quota_ino(sb)) + if (f2fs_sb_has_quota_ino(sbi)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; - if (f2fs_sb_has_quota_ino(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi)) { for (i = 0; i < MAXQUOTAS; i++) { if (f2fs_qf_ino(sbi->sb, i)) sbi->nquota_files++; @@ -3266,30 +3269,30 @@ try_onemore: f2fs_build_gc_manager(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); - goto free_nm; + goto free_stats; } - err = f2fs_build_stats(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_stats; + goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; - goto free_stats; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -3304,7 +3307,7 @@ try_onemore: #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ - if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) f2fs_msg(sb, 
KERN_ERR, @@ -3376,7 +3379,7 @@ skip_recovery: if (err) goto free_meta; } - kfree(options); + kvfree(options); /* recover broken superblock */ if (recovery) { @@ -3399,7 +3402,7 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); - if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif /* @@ -3413,41 +3416,43 @@ free_meta: free_root_inode: dput(sb->s_root); sb->s_root = NULL; -free_stats: - f2fs_destroy_stats(sbi); free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); + sbi->node_inode = NULL; +free_stats: + f2fs_destroy_stats(sbi); free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); free_devices: destroy_device_list(sbi); - kfree(sbi->ckpt); + kvfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); + sbi->meta_inode = NULL; free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) - kfree(sbi->write_io[i]); + kvfree(sbi->write_io[i]); free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(F2FS_OPTION(sbi).s_qf_names[i]); + kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - kfree(options); + kvfree(options); free_sb_buf: - kfree(raw_super); + kvfree(raw_super); free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kfree(sbi); + kvfree(sbi); /* give only one another chance */ if (retry) { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index db89741b219b..948c1a211341 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -90,34 +90,34 @@ static ssize_t features_show(struct f2fs_attr *a, if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); - if (f2fs_sb_has_encrypt(sb)) + if (f2fs_sb_has_encrypt(sbi)) len += snprintf(buf, PAGE_SIZE - len, "%s", "encryption"); - if (f2fs_sb_has_blkzoned(sb)) + if (f2fs_sb_has_blkzoned(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "blkzoned"); - if (f2fs_sb_has_extra_attr(sb)) + if (f2fs_sb_has_extra_attr(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "extra_attr"); - if (f2fs_sb_has_project_quota(sb)) + if (f2fs_sb_has_project_quota(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "projquota"); - if (f2fs_sb_has_inode_chksum(sb)) + if (f2fs_sb_has_inode_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); - if (f2fs_sb_has_flexible_inline_xattr(sb)) + if (f2fs_sb_has_flexible_inline_xattr(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); - if (f2fs_sb_has_quota_ino(sb)) + if (f2fs_sb_has_quota_ino(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); - if (f2fs_sb_has_inode_crtime(sb)) + if (f2fs_sb_has_inode_crtime(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); - if (f2fs_sb_has_lost_found(sb)) + if (f2fs_sb_has_lost_found(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); - if (f2fs_sb_has_sb_chksum(sb)) + if (f2fs_sb_has_sb_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "sb_checksum"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); @@ -246,6 +246,11 @@ out: return count; } + if (!strcmp(a->attr.name, "migration_granularity")) { + if (t == 0 || t > sbi->segs_per_sec) + return -EINVAL; + } + if (!strcmp(a->attr.name, "trim_sections")) return -EINVAL; @@ -406,6 +411,7 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); @@ -460,6 +466,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_hot_blocks), ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), + ATTR_LIST(migration_granularity), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c7c3c2a63a85..71b27c55988a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -334,7 +334,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, - void **base_addr) + void **base_addr, int *base_size) { void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; @@ -345,8 +345,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), - inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + *base_size = inline_size + size + XATTR_PADDING_SIZE; + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -358,8 +358,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); - if (*xe) + if (*xe) { + *base_size = inline_size; goto check; + } } /* read from xattr node block */ @@ -461,7 +463,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } f2fs_wait_on_page_writeback(ipage ? 
ipage : in_page, - NODE, true); + NODE, true, true); /* no need to use xattr node block */ if (hsize <= inline_size) { err = f2fs_truncate_xattr_node(inode); @@ -485,7 +487,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, goto in_page_out; } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE, true); + f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -520,6 +522,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, int error = 0; unsigned int size, len; void *base_addr = NULL; + int base_size; if (name == NULL) return -EINVAL; @@ -530,7 +533,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr); + &entry, &base_addr, &base_size); up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -544,6 +547,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (buffer) { char *pval = entry->e_name + entry->e_name_len; + + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } memcpy(buffer, pval, size); } error = size; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 7a182c87f378..ab1d7f35f6c2 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -715,6 +715,9 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob if (awaken) wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + /* Prevent a race with our last child, which has to signal EV_CLEARED * before dropping our spinlock. 
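The f2fs xattr hunks above thread a new base_size out of lookup_all_xattrs() so that f2fs_getxattr() can reject a value whose claimed size runs past the looked-up buffer. A standalone sketch of that bounds check, using a toy length-prefixed entry rather than the real f2fs_xattr_entry:

#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Toy entry: name_len bytes of name, value bytes follow the name. */
struct xentry {
        unsigned char name_len;
        char name[];
};

static int copy_value(char *dst, const char *base, int base_size,
                      const struct xentry *e, int value_size)
{
        const char *pval = e->name + e->name_len;

        /* the same bound the hunk above adds before the memcpy() */
        if (base_size - (pval - base) < value_size)
                return -ERANGE;
        memcpy(dst, pval, value_size);
        return value_size;
}

int main(void)
{
        char base[16] = { 4, 'n', 'a', 'm', 'e', 'v', 'a', 'l' };
        const struct xentry *e = (const struct xentry *)base;
        char out[8];

        printf("%d\n", copy_value(out, base, sizeof(base), e, 3));  /* ok: 3 */
        printf("%d\n", copy_value(out, base, sizeof(base), e, 64)); /* -ERANGE */
        return 0;
}

Without the check, a corrupted name_len or size on disk turns the memcpy() into an out-of-bounds read of the heap buffer.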
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c6e4e4c0221b..0fc9e87802b5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1751,7 +1751,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_CACHE_SHIFT; @@ -1766,6 +1765,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].offset = offset; req->page_descs[req->num_pages].length = this_num; req->num_pages++; @@ -2091,10 +2091,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); + pipe_lock(pipe); for (idx = 0; idx < nbuf; idx++) { struct pipe_buffer *buf = &bufs[idx]; buf->ops->release(pipe, buf); } + pipe_unlock(pipe); + out: kfree(bufs); return ret; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ff580633a63..690bbb22ed6b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1853,7 +1853,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_zone_page_state(page, NR_WRITEBACK_TEMP); + dec_zone_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 1ab19e660e69..1ff5774a5382 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -328,13 +328,14 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 3345c7553edc..7adc8a327e03 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -453,14 +453,15 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a17da8b57fc6..ab34f613fa85 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -118,6 +118,16 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account. + */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -136,17 +146,31 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; + /* + * page based offset in vm_pgoff could be sufficiently large to + * overflow a loff_t when converted to byte offset. 
This can + * only happen on architectures where sizeof(loff_t) == + * sizeof(unsigned long). So, only check in those instances. + */ + if (sizeof(unsigned long) == sizeof(loff_t)) { + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + return -EINVAL; + } + + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + /* check for overflow */ + if (len < vma_len) + return -EINVAL; mutex_lock(&inode->i_mutex); file_accessed(file); ret = -ENOMEM; - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, @@ -155,7 +179,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; if (vma->vm_flags & VM_WRITE && inode->i_size < len) - inode->i_size = len; + i_size_write(inode, len); out: mutex_unlock(&inode->i_mutex); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 1544f530ccd0..023e7f32ee1b 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - cancel_delayed_work_sync(&c->wbuf_dwork); + if (jffs2_is_writebuffered(c)) + cancel_delayed_work_sync(&c->wbuf_dwork); #endif mutex_lock(&c->alloc_sem); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 62f358f67764..412fcfbc50e2 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2376,8 +2376,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->auth_info.flavor_len > 0 && - clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; Ebusy: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9690cb4dd588..03c7a4e7b6ba 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1106,6 +1106,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': + if (nn->nfsd_serv) + return -EBUSY; nfsd4_end_grace(nn); break; default: diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 272269f1c310..9ee8bcfbf00f 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -146,7 +146,6 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, BUG(); } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(READ, bh); @@ -300,7 +299,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, continue; } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 827fc9809bc2..3494e220b510 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -125,10 +125,10 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { - iput(inode); trace_ocfs2_get_dentry_generation((unsigned long long)blkno, handle->ih_generation, inode->i_generation); + iput(inode); result = ERR_PTR(-ESTALE); goto bail; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0711..85111d740c9d 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) if (num_used || 
alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) - mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + || la->la_bm_off) { + mlog(ML_ERROR, "inconsistent detected, clean journal with" + " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + status = -EINVAL; + goto bail; + } + osb->local_alloc_bh = alloc_bh; osb->local_alloc_state = OCFS2_LA_ENABLED; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..c1a83c58456e 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -156,18 +156,14 @@ out: } /* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. + * lock allocator, and reserve appropriate number of bits for + * meta blocks. */ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, +static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_move, u32 extents_to_split, struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, int extra_blocks, int *credits) { @@ -192,13 +188,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, goto out; } - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); @@ -260,10 +249,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + *len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -286,6 +275,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } + /* + * Make sure ocfs2_reserve_cluster is called after + * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. + * + * If ocfs2_reserve_cluster is called + * before __ocfs2_flush_truncate_log, dead lock on global bitmap + * may happen. 
+ * + */ + ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); + if (ret) { + mlog_errno(ret); + goto out_unlock_mutex; + } + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -606,9 +610,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/proc/array.c b/fs/proc/array.c index 161441f52ebf..015cdc615dfb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -333,7 +333,7 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); #endif - seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + seq_printf(m, "Speculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: seq_printf(m, "unknown"); diff --git a/fs/proc/base.c b/fs/proc/base.c index ee9b7f0ea938..86cc5554198f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -255,7 +255,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); if (rv <= 0) goto out_free_page; @@ -273,7 +273,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -308,7 +308,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -357,7 +357,7 @@ skip_argv: bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -869,6 +869,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags; if (!mm) return 0; @@ -881,6 +882,11 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + /* Maybe we should limit FOLL_FORCE to actual ptrace users? 
*/ + flags = FOLL_FORCE; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -889,7 +895,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -1001,8 +1007,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 99c4ffaa43a8..2df3b2358b0a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -143,7 +143,7 @@ static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma) struct page *page; pages_pinned = get_user_pages(current, mm, page_start_vaddr, - 1, 0, 0, &page, NULL); + 1, 0, &page, NULL); if (pages_pinned < 1) { seq_puts(m, "<fault>]"); return; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 40a0fe0a4e05..6fbfa8189451 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -392,8 +392,8 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); + psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, + s, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index ecdb3baa1283..11e558efd61e 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -488,6 +488,11 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { + if (buffer_size(prz) == 0) { + pr_debug("found existing empty buffer\n"); + return 0; + } + if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) pr_info("found existing invalid buffer, size %zu, start %zu\n", diff --git a/fs/read_write.c b/fs/read_write.c index bfd1a5dddf6e..16e554ba885d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -363,8 +363,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) + if (ret > 0) { *ppos = kiocb.ki_pos; + fsnotify_modify(file); + } return ret; } EXPORT_SYMBOL(vfs_iter_write); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 6dd158a216f4..ffb093e72b6c 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -26,6 +26,34 @@ config SQUASHFS If unsure, say N. choice + prompt "File decompression options" + depends on SQUASHFS + help + Squashfs now supports two options for decompressing file + data. Traditionally Squashfs has decompressed into an + intermediate buffer and then memcopied it into the page cache. + Squashfs now supports the ability to decompress directly into + the page cache. + + If unsure, select "Decompress file data into an intermediate buffer" + +config SQUASHFS_FILE_CACHE + bool "Decompress file data into an intermediate buffer" + help + Decompress file data into an intermediate buffer and then + memcopy it into the page cache. 
+ +config SQUASHFS_FILE_DIRECT + bool "Decompress files directly into the page cache" + help + Directly decompress file data into the page cache. + Doing so can significantly improve performance because + it eliminates a memcpy and it also removes the lock contention + on the single buffer. + +endchoice + +choice prompt "Decompressor parallelisation options" depends on SQUASHFS help diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index fe51f1507ed1..246a6f329d89 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o decompressor.o -squashfs-y += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b3b95e2ae2ff..82bc942fc437 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -28,12 +28,9 @@ #include <linux/fs.h> #include <linux/vfs.h> -#include <linux/bio.h> #include <linux/slab.h> #include <linux/string.h> -#include <linux/pagemap.h> #include <linux/buffer_head.h> -#include <linux/workqueue.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -41,381 +38,45 @@ #include "decompressor.h" #include "page_actor.h" -static struct workqueue_struct *squashfs_read_wq; - -struct squashfs_read_request { - struct super_block *sb; - u64 index; - int length; - int compressed; - int offset; - u64 read_end; - struct squashfs_page_actor *output; - enum { - SQUASHFS_COPY, - SQUASHFS_DECOMPRESS, - SQUASHFS_METADATA, - } data_processing; - bool synchronous; - - /* - * If the read is synchronous, it is possible to retrieve information - * about the request by setting these pointers. 
- */ - int *res; - int *bytes_read; - int *bytes_uncompressed; - - int nr_buffers; - struct buffer_head **bh; - struct work_struct offload; -}; - -struct squashfs_bio_request { - struct buffer_head **bh; - int nr_buffers; -}; - -static int squashfs_bio_submit(struct squashfs_read_request *req); - -int squashfs_init_read_wq(void) -{ - squashfs_read_wq = create_workqueue("SquashFS read wq"); - return !!squashfs_read_wq; -} - -void squashfs_destroy_read_wq(void) -{ - flush_workqueue(squashfs_read_wq); - destroy_workqueue(squashfs_read_wq); -} - -static void free_read_request(struct squashfs_read_request *req, int error) -{ - if (!req->synchronous) - squashfs_page_actor_free(req->output, error); - if (req->res) - *(req->res) = error; - kfree(req->bh); - kfree(req); -} - -static void squashfs_process_blocks(struct squashfs_read_request *req) -{ - int error = 0; - int bytes, i, length; - struct squashfs_sb_info *msblk = req->sb->s_fs_info; - struct squashfs_page_actor *actor = req->output; - struct buffer_head **bh = req->bh; - int nr_buffers = req->nr_buffers; - - for (i = 0; i < nr_buffers; ++i) { - if (!bh[i]) - continue; - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - error = -EIO; - } - if (error) - goto cleanup; - - if (req->data_processing == SQUASHFS_METADATA) { - /* Extract the length of the metadata block */ - if (req->offset != msblk->devblksize - 1) { - length = le16_to_cpup((__le16 *) - (bh[0]->b_data + req->offset)); - } else { - length = (unsigned char)bh[0]->b_data[req->offset]; - length |= (unsigned char)bh[1]->b_data[0] << 8; - } - req->compressed = SQUASHFS_COMPRESSED(length); - req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS - : SQUASHFS_COPY; - length = SQUASHFS_COMPRESSED_SIZE(length); - if (req->index + length + 2 > req->read_end) { - for (i = 0; i < nr_buffers; ++i) - put_bh(bh[i]); - kfree(bh); - req->length = length; - req->index += 2; - squashfs_bio_submit(req); - return; - } - req->length = length; - req->offset = (req->offset + 2) % PAGE_SIZE; - if (req->offset < 2) { - put_bh(bh[0]); - ++bh; - --nr_buffers; - } - } - if (req->bytes_read) - *(req->bytes_read) = req->length; - - if (req->data_processing == SQUASHFS_COPY) { - squashfs_bh_to_actor(bh, nr_buffers, req->output, req->offset, - req->length, msblk->devblksize); - } else if (req->data_processing == SQUASHFS_DECOMPRESS) { - req->length = squashfs_decompress(msblk, bh, nr_buffers, - req->offset, req->length, actor); - if (req->length < 0) { - error = -EIO; - goto cleanup; - } - } - - /* Last page may have trailing bytes not filled */ - bytes = req->length % PAGE_SIZE; - if (bytes && actor->page[actor->pages - 1]) - zero_user_segment(actor->page[actor->pages - 1], bytes, - PAGE_SIZE); - -cleanup: - if (req->bytes_uncompressed) - *(req->bytes_uncompressed) = req->length; - if (error) { - for (i = 0; i < nr_buffers; ++i) - if (bh[i]) - put_bh(bh[i]); - } - free_read_request(req, error); -} - -static void read_wq_handler(struct work_struct *work) -{ - squashfs_process_blocks(container_of(work, - struct squashfs_read_request, offload)); -} - -static void squashfs_bio_end_io(struct bio *bio) -{ - int i; - int error = bio->bi_error; - struct squashfs_bio_request *bio_req = bio->bi_private; - - bio_put(bio); - - for (i = 0; i < bio_req->nr_buffers; ++i) { - if (!bio_req->bh[i]) - continue; - if (!error) - set_buffer_uptodate(bio_req->bh[i]); - else - clear_buffer_uptodate(bio_req->bh[i]); - unlock_buffer(bio_req->bh[i]); - } - kfree(bio_req); -} - -static int bh_is_optional(struct 
squashfs_read_request *req, int idx) -{ - int start_idx, end_idx; - struct squashfs_sb_info *msblk = req->sb->s_fs_info; - - start_idx = (idx * msblk->devblksize - req->offset) >> PAGE_SHIFT; - end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) >> PAGE_SHIFT; - if (start_idx >= req->output->pages) - return 1; - if (start_idx < 0) - start_idx = end_idx; - if (end_idx >= req->output->pages) - end_idx = start_idx; - return !req->output->page[start_idx] && !req->output->page[end_idx]; -} - -static int actor_getblks(struct squashfs_read_request *req, u64 block) -{ - int i; - - req->bh = kmalloc_array(req->nr_buffers, sizeof(*(req->bh)), GFP_NOIO); - if (!req->bh) - return -ENOMEM; - - for (i = 0; i < req->nr_buffers; ++i) { - /* - * When dealing with an uncompressed block, the actor may - * contains NULL pages. There's no need to read the buffers - * associated with these pages. - */ - if (!req->compressed && bh_is_optional(req, i)) { - req->bh[i] = NULL; - continue; - } - req->bh[i] = sb_getblk(req->sb, block + i); - if (!req->bh[i]) { - while (--i) { - if (req->bh[i]) - put_bh(req->bh[i]); - } - return -1; - } - } - return 0; -} - -static int squashfs_bio_submit(struct squashfs_read_request *req) +/* + * Read the metadata block length, this is stored in the first two + * bytes of the metadata block. + */ +static struct buffer_head *get_block_length(struct super_block *sb, + u64 *cur_index, int *offset, int *length) { - struct bio *bio = NULL; + struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head *bh; - struct squashfs_bio_request *bio_req = NULL; - int b = 0, prev_block = 0; - struct squashfs_sb_info *msblk = req->sb->s_fs_info; - - u64 read_start = round_down(req->index, msblk->devblksize); - u64 read_end = round_up(req->index + req->length, msblk->devblksize); - sector_t block = read_start >> msblk->devblksize_log2; - sector_t block_end = read_end >> msblk->devblksize_log2; - int offset = read_start - round_down(req->index, PAGE_SIZE); - int nr_buffers = block_end - block; - int blksz = msblk->devblksize; - int bio_max_pages = nr_buffers > BIO_MAX_PAGES ? 
BIO_MAX_PAGES - : nr_buffers; - /* Setup the request */ - req->read_end = read_end; - req->offset = req->index - read_start; - req->nr_buffers = nr_buffers; - if (actor_getblks(req, block) < 0) - goto getblk_failed; - - /* Create and submit the BIOs */ - for (b = 0; b < nr_buffers; ++b, offset += blksz) { - bh = req->bh[b]; - if (!bh || !trylock_buffer(bh)) - continue; - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - continue; + bh = sb_bread(sb, *cur_index); + if (bh == NULL) + return NULL; + + if (msblk->devblksize - *offset == 1) { + *length = (unsigned char) bh->b_data[*offset]; + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *length |= (unsigned char) bh->b_data[0] << 8; + *offset = 1; + } else { + *length = (unsigned char) bh->b_data[*offset] | + (unsigned char) bh->b_data[*offset + 1] << 8; + *offset += 2; + + if (*offset == msblk->devblksize) { + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *offset = 0; } - offset %= PAGE_SIZE; - - /* Append the buffer to the current BIO if it is contiguous */ - if (bio && bio_req && prev_block + 1 == b) { - if (bio_add_page(bio, bh->b_page, blksz, offset)) { - bio_req->nr_buffers += 1; - prev_block = b; - continue; - } - } - - /* Otherwise, submit the current BIO and create a new one */ - if (bio) - submit_bio(READ, bio); - bio_req = kcalloc(1, sizeof(struct squashfs_bio_request), - GFP_NOIO); - if (!bio_req) - goto req_alloc_failed; - bio_req->bh = &req->bh[b]; - bio = bio_alloc(GFP_NOIO, bio_max_pages); - if (!bio) - goto bio_alloc_failed; - bio->bi_bdev = req->sb->s_bdev; - bio->bi_iter.bi_sector = (block + b) - << (msblk->devblksize_log2 - 9); - bio->bi_private = bio_req; - bio->bi_end_io = squashfs_bio_end_io; - - bio_add_page(bio, bh->b_page, blksz, offset); - bio_req->nr_buffers += 1; - prev_block = b; } - if (bio) - submit_bio(READ, bio); - if (req->synchronous) - squashfs_process_blocks(req); - else { - INIT_WORK(&req->offload, read_wq_handler); - schedule_work(&req->offload); - } - return 0; - -bio_alloc_failed: - kfree(bio_req); -req_alloc_failed: - unlock_buffer(bh); - while (--nr_buffers >= b) - if (req->bh[nr_buffers]) - put_bh(req->bh[nr_buffers]); - while (--b >= 0) - if (req->bh[b]) - wait_on_buffer(req->bh[b]); -getblk_failed: - free_read_request(req, -ENOMEM); - return -ENOMEM; + return bh; } -static int read_metadata_block(struct squashfs_read_request *req, - u64 *next_index) -{ - int ret, error, bytes_read = 0, bytes_uncompressed = 0; - struct squashfs_sb_info *msblk = req->sb->s_fs_info; - - if (req->index + 2 > msblk->bytes_used) { - free_read_request(req, -EINVAL); - return -EINVAL; - } - req->length = 2; - - /* Do not read beyond the end of the device */ - if (req->index + req->length > msblk->bytes_used) - req->length = msblk->bytes_used - req->index; - req->data_processing = SQUASHFS_METADATA; - - /* - * Reading metadata is always synchronous because we don't know the - * length in advance and the function is expected to update - * 'next_index' and return the length. - */ - req->synchronous = true; - req->res = &error; - req->bytes_read = &bytes_read; - req->bytes_uncompressed = &bytes_uncompressed; - - TRACE("Metadata block @ 0x%llx, %scompressed size %d, src size %d\n", - req->index, req->compressed ? 
"" : "un", bytes_read, - req->output->length); - - ret = squashfs_bio_submit(req); - if (ret) - return ret; - if (error) - return error; - if (next_index) - *next_index += 2 + bytes_read; - return bytes_uncompressed; -} - -static int read_data_block(struct squashfs_read_request *req, int length, - u64 *next_index, bool synchronous) -{ - int ret, error = 0, bytes_uncompressed = 0, bytes_read = 0; - - req->compressed = SQUASHFS_COMPRESSED_BLOCK(length); - req->length = length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS - : SQUASHFS_COPY; - - req->synchronous = synchronous; - if (synchronous) { - req->res = &error; - req->bytes_read = &bytes_read; - req->bytes_uncompressed = &bytes_uncompressed; - } - - TRACE("Data block @ 0x%llx, %scompressed size %d, src size %d\n", - req->index, req->compressed ? "" : "un", req->length, - req->output->length); - - ret = squashfs_bio_submit(req); - if (ret) - return ret; - if (synchronous) - ret = error ? error : bytes_uncompressed; - if (next_index) - *next_index += length; - return ret; -} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -426,50 +87,130 @@ static int read_data_block(struct squashfs_read_request *req, int length, * generated a larger block - this does occasionally happen with compression * algorithms). */ -static int __squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output, bool sync) +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) { - struct squashfs_read_request *req; + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head **bh; + int offset = index & ((1 << msblk->devblksize_log2) - 1); + u64 cur_index = index >> msblk->devblksize_log2; + int bytes, compressed, b = 0, k = 0, avail, i; - req = kcalloc(1, sizeof(struct squashfs_read_request), GFP_KERNEL); - if (!req) { - if (!sync) - squashfs_page_actor_free(output, -ENOMEM); + bh = kcalloc(((output->length + msblk->devblksize - 1) + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); + if (bh == NULL) return -ENOMEM; - } - req->sb = sb; - req->index = index; - req->output = output; + if (length) { + /* + * Datablock. + */ + bytes = -offset; + compressed = SQUASHFS_COMPRESSED_BLOCK(length); + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + if (next_index) + *next_index = index + length; + + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", + index, compressed ? "" : "un", length, output->length); + + if (length < 0 || length > output->length || + (index + length) > msblk->bytes_used) + goto read_failure; + + for (b = 0; bytes < length; b++, cur_index++) { + bh[b] = sb_getblk(sb, cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b, bh); + } else { + /* + * Metadata block. + */ + if ((index + 2) > msblk->bytes_used) + goto read_failure; + + bh[0] = get_block_length(sb, &cur_index, &offset, &length); + if (bh[0] == NULL) + goto read_failure; + b = 1; + + bytes = msblk->devblksize - offset; + compressed = SQUASHFS_COMPRESSED(length); + length = SQUASHFS_COMPRESSED_SIZE(length); + if (next_index) + *next_index = index + length + 2; - if (next_index) - *next_index = index; + TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + compressed ? 
"" : "un", length); - if (length) - length = read_data_block(req, length, next_index, sync); - else - length = read_metadata_block(req, next_index); + if (length < 0 || length > output->length || + (index + length) > msblk->bytes_used) + goto block_release; - if (length < 0) { - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long)index); - return -EIO; + for (; bytes < length; b++) { + bh[b] = sb_getblk(sb, ++cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b - 1, bh + 1); } - return length; -} + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } -int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) -{ - return __squashfs_read_data(sb, index, length, next_index, output, - true); -} + if (compressed) { + if (!msblk->stream) + goto read_failure; + length = squashfs_decompress(msblk, bh, b, offset, length, + output); + if (length < 0) + goto read_failure; + } else { + /* + * Block is uncompressed. + */ + int in, pg_offset = 0; + void *data = squashfs_first_page(output); + + for (bytes = length; k < b; k++) { + in = min(bytes, msblk->devblksize - offset); + bytes -= in; + while (in) { + if (pg_offset == PAGE_CACHE_SIZE) { + data = squashfs_next_page(output); + pg_offset = 0; + } + avail = min_t(int, in, PAGE_CACHE_SIZE - + pg_offset); + memcpy(data + pg_offset, bh[k]->b_data + offset, + avail); + in -= avail; + pg_offset += avail; + offset += avail; + } + offset = 0; + put_bh(bh[k]); + } + squashfs_finish_page(output); + } -int squashfs_read_data_async(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) -{ + kfree(bh); + return length; + +block_release: + for (; k < b; k++) + put_bh(bh[k]); - return __squashfs_read_data(sb, index, length, next_index, output, - false); +read_failure: + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); + kfree(bh); + return -EIO; } diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 7d78c3d28bbd..91ce49c05b7c 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -209,14 +209,17 @@ void squashfs_cache_put(struct squashfs_cache_entry *entry) */ void squashfs_cache_delete(struct squashfs_cache *cache) { - int i; + int i, j; if (cache == NULL) return; for (i = 0; i < cache->entries; i++) { - if (cache->entry[i].page) - free_page_array(cache->entry[i].page, cache->pages); + if (cache->entry[i].data) { + for (j = 0; j < cache->pages; j++) + kfree(cache->entry[i].data[j]); + kfree(cache->entry[i].data); + } kfree(cache->entry[i].actor); } @@ -233,7 +236,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { - int i; + int i, j; struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) { @@ -265,13 +268,22 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, init_waitqueue_head(&cache->entry[i].wait_queue); entry->cache = cache; entry->block = SQUASHFS_INVALID_BLK; - entry->page = alloc_page_array(cache->pages, GFP_KERNEL); - if (!entry->page) { + entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); + if (entry->data == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; } - entry->actor = squashfs_page_actor_init(entry->page, - cache->pages, 0, NULL); + + for (j = 0; j < 
cache->pages; j++) { + entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (entry->data[j] == NULL) { + ERROR("Failed to allocate %s buffer\n", name); + goto cleanup; + } + } + + entry->actor = squashfs_page_actor_init(entry->data, + cache->pages, 0); if (entry->actor == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; @@ -302,20 +314,18 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, return min(length, entry->length - offset); while (offset < entry->length) { - void *buff = kmap_atomic(entry->page[offset / PAGE_CACHE_SIZE]) - + (offset % PAGE_CACHE_SIZE); + void *buff = entry->data[offset / PAGE_CACHE_SIZE] + + (offset % PAGE_CACHE_SIZE); int bytes = min_t(int, entry->length - offset, PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); if (bytes >= remaining) { memcpy(buffer, buff, remaining); - kunmap_atomic(buff); remaining = 0; break; } memcpy(buffer, buff, bytes); - kunmap_atomic(buff); buffer += bytes; remaining -= bytes; offset += bytes; @@ -409,38 +419,43 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, void *squashfs_read_table(struct super_block *sb, u64 block, int length) { int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - struct page **page; - void *buff; - int res; + int i, res; + void *table, *buffer, **data; struct squashfs_page_actor *actor; - page = alloc_page_array(pages, GFP_KERNEL); - if (!page) + table = buffer = kmalloc(length, GFP_KERNEL); + if (table == NULL) return ERR_PTR(-ENOMEM); - actor = squashfs_page_actor_init(page, pages, length, NULL); - if (actor == NULL) { + data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) { res = -ENOMEM; goto failed; } + actor = squashfs_page_actor_init(data, pages, length); + if (actor == NULL) { + res = -ENOMEM; + goto failed2; + } + + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + data[i] = buffer; + res = squashfs_read_data(sb, block, length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); + kfree(data); + kfree(actor); + if (res < 0) - goto failed2; + goto failed; - buff = kmalloc(length, GFP_KERNEL); - if (!buff) - goto failed2; - squashfs_actor_to_buf(actor, buff, length); - squashfs_page_actor_free(actor, 0); - free_page_array(page, pages); - return buff; + return table; failed2: - squashfs_page_actor_free(actor, 0); + kfree(data); failed: - free_page_array(page, pages); + kfree(table); return ERR_PTR(res); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 7de35bf297aa..e9034bf6e5ae 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -24,8 +24,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/highmem.h> -#include <linux/fs.h> +#include <linux/buffer_head.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -95,44 +94,40 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *comp_opts, *buffer = NULL; - struct page *page; + void *buffer = NULL, *comp_opts; struct squashfs_page_actor *actor = NULL; int length = 0; - if (!SQUASHFS_COMP_OPTS(flags)) - return squashfs_comp_opts(msblk, buffer, length); - /* * Read decompressor specific options from file system if present */ - - page = alloc_page(GFP_KERNEL); - if (!page) - return ERR_PTR(-ENOMEM); - - actor = squashfs_page_actor_init(&page, 1, 0, NULL); - if (actor == NULL) { - comp_opts = 
ERR_PTR(-ENOMEM); - goto actor_error; - } - - length = squashfs_read_data(sb, - sizeof(struct squashfs_super_block), 0, NULL, actor); - - if (length < 0) { - comp_opts = ERR_PTR(length); - goto read_error; + if (SQUASHFS_COMP_OPTS(flags)) { + buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (buffer == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + actor = squashfs_page_actor_init(&buffer, 1, 0); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); + + if (length < 0) { + comp_opts = ERR_PTR(length); + goto out; + } } - buffer = kmap_atomic(page); comp_opts = squashfs_comp_opts(msblk, buffer, length); - kunmap_atomic(buffer); -read_error: - squashfs_page_actor_free(actor, 0); -actor_error: - __free_page(page); +out: + kfree(actor); + kfree(buffer); return comp_opts; } diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 43d946617d2d..1ec7bae2751d 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,16 +47,12 @@ #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mutex.h> -#include <linux/mm_inline.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" -// Backported from 4.5 -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) - /* * Locate cache slot in range [offset, index] for specified inode. If * there's more than one return the slot closest to index. @@ -446,21 +442,6 @@ static int squashfs_readpage_fragment(struct page *page) return res; } -static int squashfs_readpages_fragment(struct page *page, - struct list_head *readahead_pages, struct address_space *mapping) -{ - if (!page) { - page = lru_to_page(readahead_pages); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) { - put_page(page); - return 0; - } - } - return squashfs_readpage_fragment(page); -} - static int squashfs_readpage_sparse(struct page *page, int index, int file_end) { struct inode *inode = page->mapping->host; @@ -473,105 +454,54 @@ static int squashfs_readpage_sparse(struct page *page, int index, int file_end) return 0; } -static int squashfs_readpages_sparse(struct page *page, - struct list_head *readahead_pages, int index, int file_end, - struct address_space *mapping) -{ - if (!page) { - page = lru_to_page(readahead_pages); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) { - put_page(page); - return 0; - } - } - return squashfs_readpage_sparse(page, index, file_end); -} - -static int __squashfs_readpages(struct file *file, struct page *page, - struct list_head *readahead_pages, unsigned int nr_pages, - struct address_space *mapping) +static int squashfs_readpage(struct file *file, struct page *page) { - struct inode *inode = mapping->host; + struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int res; - - do { - struct page *cur_page = page ? 
page - : lru_to_page(readahead_pages); - int page_index = cur_page->index; - int index = page_index >> (msblk->block_log - PAGE_CACHE_SHIFT); - - if (page_index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) - return 1; - - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - - if (bsize < 0) - return -1; - - if (bsize == 0) { - res = squashfs_readpages_sparse(page, - readahead_pages, index, file_end, - mapping); - } else { - res = squashfs_readpages_block(page, - readahead_pages, &nr_pages, mapping, - page_index, block, bsize); - } - } else { - res = squashfs_readpages_fragment(page, - readahead_pages, mapping); - } - if (res) - return 0; - page = NULL; - } while (readahead_pages && !list_empty(readahead_pages)); - - return 0; -} - -static int squashfs_readpage(struct file *file, struct page *page) -{ - int ret; + void *pageaddr; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(page->mapping->host)->start); + page->index, squashfs_i(inode)->start); - get_page(page); + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; - ret = __squashfs_readpages(file, page, NULL, 1, page->mapping); - if (ret) { - flush_dcache_page(page); - if (ret < 0) - SetPageError(page); - else - SetPageUptodate(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - unlock_page(page); - put_page(page); - } + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; - return 0; -} + if (bsize == 0) + res = squashfs_readpage_sparse(page, index, file_end); + else + res = squashfs_readpage_block(page, block, bsize); + } else + res = squashfs_readpage_fragment(page); + + if (!res) + return 0; + +error_out: + SetPageError(page); +out: + pageaddr = kmap_atomic(page); + memset(pageaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(pageaddr); + flush_dcache_page(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); -static int squashfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) -{ - TRACE("Entered squashfs_readpages, %u pages, first page index %lx\n", - nr_pages, lru_to_page(pages)->index); - __squashfs_readpages(file, NULL, pages, nr_pages, mapping); return 0; } const struct address_space_operations squashfs_aops = { - .readpage = squashfs_readpage, - .readpages = squashfs_readpages, + .readpage = squashfs_readpage }; diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c new file mode 100644 index 000000000000..f2310d2a2019 --- /dev/null +++ b/fs/squashfs/file_cache.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mutex.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* Read separately compressed datablock and memcopy into page cache */ +int squashfs_readpage_block(struct page *page, u64 block, int bsize) +{ + struct inode *i = page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + else + squashfs_copy_cache(page, buffer, buffer->length, 0); + + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index c97af4c6ccd0..43e7a7eddac0 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -13,7 +13,6 @@ #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mutex.h> -#include <linux/mm_inline.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -21,139 +20,157 @@ #include "squashfs.h" #include "page_actor.h" -// Backported from 4.5 -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page); -static void release_actor_pages(struct page **page, int pages, int error) -{ - int i; - - for (i = 0; i < pages; i++) { - if (!page[i]) - continue; - flush_dcache_page(page[i]); - if (!error) - SetPageUptodate(page[i]); - else { - SetPageError(page[i]); - zero_user_segment(page[i], 0, PAGE_CACHE_SIZE); - } - unlock_page(page[i]); - put_page(page[i]); - } - kfree(page); -} +/* Read separately compressed datablock directly into page cache */ +int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) -/* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ -static struct squashfs_page_actor *actor_from_page_cache( - unsigned int actor_pages, struct page *target_page, - struct list_head *rpages, unsigned int *nr_pages, int start_index, - struct address_space *mapping) { + struct inode *inode = target_page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + + int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = target_page->index & ~mask; + int end_index = start_index | mask; + int i, n, pages, missing_pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; - int i, n; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - - page = kmalloc_array(actor_pages, sizeof(void *), GFP_KERNEL); - if (!page) - return NULL; - - for (i = 0, n = start_index; i < actor_pages; i++, n++) { - if (target_page == NULL && rpages && !list_empty(rpages)) { - struct page *cur_page = lru_to_page(rpages); - - if (cur_page->index < start_index + actor_pages) { - list_del(&cur_page->lru); - --(*nr_pages); - if (add_to_page_cache_lru(cur_page, mapping, - cur_page->index, gfp)) - put_page(cur_page); - else - target_page = cur_page; - } else - rpages = NULL; - } + void *pageaddr; + + if (end_index > file_end) + end_index = file_end; + + pages = end_index - start_index + 1; - if (target_page && target_page->index == n) { - page[i] = target_page; - target_page = NULL; - } else { - page[i] = 
grab_cache_page_nowait(mapping, n); - if (page[i] == NULL) - continue; + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); + if (page == NULL) + return res; + + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(page, pages, 0); + if (actor == NULL) + goto out; + + /* Try to grab all the pages covered by the Squashfs block */ + for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + page[i] = (n == target_page->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, n); + + if (page[i] == NULL) { + missing_pages++; + continue; } if (PageUptodate(page[i])) { unlock_page(page[i]); - put_page(page[i]); + page_cache_release(page[i]); page[i] = NULL; + missing_pages++; } } - actor = squashfs_page_actor_init(page, actor_pages, 0, - release_actor_pages); - if (!actor) { - release_actor_pages(page, actor_pages, -ENOMEM); - kfree(page); - return NULL; + if (missing_pages) { + /* + * Couldn't get one or more pages, this page has either + * been VM reclaimed, but others are still in the page cache + * and uptodate, or we're racing with another thread in + * squashfs_readpage also trying to grab them. Fall back to + * using an intermediate buffer. + */ + res = squashfs_read_cache(target_page, block, bsize, pages, + page); + if (res < 0) + goto mark_errored; + + goto out; } - return actor; -} -int squashfs_readpages_block(struct page *target_page, - struct list_head *readahead_pages, - unsigned int *nr_pages, - struct address_space *mapping, - int page_index, u64 block, int bsize) + /* Decompress directly into the page cache buffers */ + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + if (res < 0) + goto mark_errored; + + /* Last page may have trailing bytes not filled */ + bytes = res % PAGE_CACHE_SIZE; + if (bytes) { + pageaddr = kmap_atomic(page[pages - 1]); + memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); + kunmap_atomic(pageaddr); + } -{ - struct squashfs_page_actor *actor; - struct inode *inode = mapping->host; - struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int start_index, end_index, file_end, actor_pages, res; - int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + /* Mark pages as uptodate, unlock and release */ + for (i = 0; i < pages; i++) { + flush_dcache_page(page[i]); + SetPageUptodate(page[i]); + unlock_page(page[i]); + if (page[i] != target_page) + page_cache_release(page[i]); + } - /* - * If readpage() is called on an uncompressed datablock, we can just - * read the pages instead of fetching the whole block. - * This greatly improves the performance when a process keep doing - * random reads because we only fetch the necessary data. - * The readahead algorithm will take care of doing speculative reads - * if necessary. - * We can't read more than 1 block even if readahead provides use more - * pages because we don't know yet if the next block is compressed or - * not. + kfree(actor); + kfree(page); + + return 0; + +mark_errored: + /* Decompression failed, mark pages as errored. 
Target_page is + * dealt with by the caller */ - if (bsize && !SQUASHFS_COMPRESSED_BLOCK(bsize)) { - u64 block_end = block + msblk->block_size; - - block += (page_index & mask) * PAGE_CACHE_SIZE; - actor_pages = (block_end - block) / PAGE_CACHE_SIZE; - if (*nr_pages < actor_pages) - actor_pages = *nr_pages; - start_index = page_index; - bsize = min_t(int, bsize, (PAGE_CACHE_SIZE * actor_pages) - | SQUASHFS_COMPRESSED_BIT_BLOCK); - } else { - file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; - start_index = page_index & ~mask; - end_index = start_index | mask; - if (end_index > file_end) - end_index = file_end; - actor_pages = end_index - start_index + 1; + for (i = 0; i < pages; i++) { + if (page[i] == NULL || page[i] == target_page) + continue; + flush_dcache_page(page[i]); + SetPageError(page[i]); + unlock_page(page[i]); + page_cache_release(page[i]); } - actor = actor_from_page_cache(actor_pages, target_page, - readahead_pages, nr_pages, start_index, - mapping); - if (!actor) - return -ENOMEM; +out: + kfree(actor); + kfree(page); + return res; +} + + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page) +{ + struct inode *i = target_page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int bytes = buffer->length, res = buffer->error, n, offset = 0; + void *pageaddr; + + if (res) { + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + goto out; + } + + for (n = 0; n < pages && bytes > 0; n++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + int avail = min_t(int, bytes, PAGE_CACHE_SIZE); + + if (page[n] == NULL) + continue; + + pageaddr = kmap_atomic(page[n]); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr); + flush_dcache_page(page[n]); + SetPageUptodate(page[n]); + unlock_page(page[n]); + if (page[n] != target_page) + page_cache_release(page[n]); + } - res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL, - actor); - return res < 0 ? 
res : 0; +out: + squashfs_cache_put(buffer); + return res; } diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index df4fa3c7ddd0..c31e2bc9c081 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -94,17 +94,39 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - int res; - size_t dest_len = output->length; struct squashfs_lz4 *stream = strm; + void *buff = stream->input, *data; + int avail, i, bytes = length, res; + size_t dest_len = output->length; + + for (i = 0; i < b; i++) { + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } - squashfs_bh_to_buf(bh, b, stream->input, offset, length, - msblk->devblksize); res = lz4_decompress_unknownoutputsize(stream->input, length, stream->output, &dest_len); if (res) return -EIO; - squashfs_buf_to_actor(stream->output, output, dest_len); + + bytes = dest_len; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } + squashfs_finish_page(output); return dest_len; } diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 2c844d53a59e..244b9fbfff7b 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -79,19 +79,45 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - int res; - size_t out_len = output->length; struct squashfs_lzo *stream = strm; + void *buff = stream->input, *data; + int avail, i, bytes = length, res; + size_t out_len = output->length; + + for (i = 0; i < b; i++) { + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } - squashfs_bh_to_buf(bh, b, stream->input, offset, length, - msblk->devblksize); res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) - return -EIO; - squashfs_buf_to_actor(stream->output, output, out_len); + goto failed; - return out_len; + res = bytes = (int)out_len; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } else { + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } + } + squashfs_finish_page(output); + + return res; + +failed: + return -EIO; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 53863508e400..5a1c11f56441 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -9,145 +9,92 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/pagemap.h> -#include <linux/buffer_head.h> #include "page_actor.h" -struct squashfs_page_actor *squashfs_page_actor_init(struct page **page, - int pages, int length, void (*release_pages)(struct page **, int, int)) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; +/* + * This file contains implementations 
of page_actor for decompressing into + * an intermediate buffer, and for decompressing directly into the + * page cache. + * + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page(). + */ - actor->length = length ? : pages * PAGE_CACHE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - actor->pageaddr = NULL; - actor->release_pages = release_pages; - return actor; +/* Implementation of page_actor for decompressing into intermediate buffer */ +static void *cache_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->buffer[0]; } -void squashfs_page_actor_free(struct squashfs_page_actor *actor, int error) +static void *cache_next_page(struct squashfs_page_actor *actor) { - if (!actor) - return; + if (actor->next_page == actor->pages) + return NULL; - if (actor->release_pages) - actor->release_pages(actor->page, actor->pages, error); - kfree(actor); + return actor->buffer[actor->next_page++]; } -void squashfs_actor_to_buf(struct squashfs_page_actor *actor, void *buf, - int length) +static void cache_finish_page(struct squashfs_page_actor *actor) { - void *pageaddr; - int pos = 0, avail, i; - - for (i = 0; i < actor->pages && pos < length; ++i) { - avail = min_t(int, length - pos, PAGE_CACHE_SIZE); - if (actor->page[i]) { - pageaddr = kmap_atomic(actor->page[i]); - memcpy(buf + pos, pageaddr, avail); - kunmap_atomic(pageaddr); - } - pos += avail; - } + /* empty */ } -void squashfs_buf_to_actor(void *buf, struct squashfs_page_actor *actor, - int length) +struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length) { - void *pageaddr; - int pos = 0, avail, i; - - for (i = 0; i < actor->pages && pos < length; ++i) { - avail = min_t(int, length - pos, PAGE_CACHE_SIZE); - if (actor->page[i]) { - pageaddr = kmap_atomic(actor->page[i]); - memcpy(pageaddr, buf + pos, avail); - kunmap_atomic(pageaddr); - } - pos += avail; - } + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->buffer = buffer; + actor->pages = pages; + actor->next_page = 0; + actor->squashfs_first_page = cache_first_page; + actor->squashfs_next_page = cache_next_page; + actor->squashfs_finish_page = cache_finish_page; + return actor; } -void squashfs_bh_to_actor(struct buffer_head **bh, int nr_buffers, - struct squashfs_page_actor *actor, int offset, int length, int blksz) +/* Implementation of page_actor for decompressing directly into page cache. 
*/ +static void *direct_first_page(struct squashfs_page_actor *actor) { - void *kaddr = NULL; - int bytes = 0, pgoff = 0, b = 0, p = 0, avail, i; - - while (bytes < length) { - if (actor->page[p]) { - kaddr = kmap_atomic(actor->page[p]); - while (pgoff < PAGE_CACHE_SIZE && bytes < length) { - avail = min_t(int, blksz - offset, - PAGE_CACHE_SIZE - pgoff); - memcpy(kaddr + pgoff, bh[b]->b_data + offset, - avail); - pgoff += avail; - bytes += avail; - offset = (offset + avail) % blksz; - if (!offset) { - put_bh(bh[b]); - ++b; - } - } - kunmap_atomic(kaddr); - pgoff = 0; - } else { - for (i = 0; i < PAGE_CACHE_SIZE / blksz; ++i) { - if (bh[b]) - put_bh(bh[b]); - ++b; - } - bytes += PAGE_CACHE_SIZE; - } - ++p; - } + actor->next_page = 1; + return actor->pageaddr = kmap_atomic(actor->page[0]); } -void squashfs_bh_to_buf(struct buffer_head **bh, int nr_buffers, void *buf, - int offset, int length, int blksz) +static void *direct_next_page(struct squashfs_page_actor *actor) { - int i, avail, bytes = 0; - - for (i = 0; i < nr_buffers && bytes < length; ++i) { - avail = min_t(int, length - bytes, blksz - offset); - if (bh[i]) { - memcpy(buf + bytes, bh[i]->b_data + offset, avail); - put_bh(bh[i]); - } - bytes += avail; - offset = 0; - } + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); + + return actor->pageaddr = actor->next_page == actor->pages ? NULL : + kmap_atomic(actor->page[actor->next_page++]); } -void free_page_array(struct page **page, int nr_pages) +static void direct_finish_page(struct squashfs_page_actor *actor) { - int i; - - for (i = 0; i < nr_pages; ++i) - __free_page(page[i]); - kfree(page); + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); } -struct page **alloc_page_array(int nr_pages, int gfp_mask) +struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, + int pages, int length) { - int i; - struct page **page; + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - page = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); - if (!page) + if (actor == NULL) return NULL; - for (i = 0; i < nr_pages; ++i) { - page[i] = alloc_page(gfp_mask); - if (!page[i]) { - free_page_array(page, i); - return NULL; - } - } - return page; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + actor->pageaddr = NULL; + actor->squashfs_first_page = direct_first_page; + actor->squashfs_next_page = direct_next_page; + actor->squashfs_finish_page = direct_finish_page; + return actor; } diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index aa1ed790b5a3..26dd82008b82 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -5,61 +5,77 @@ * Phillip Lougher <phillip@squashfs.org.uk> * * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level squashfsory. + * the COPYING file in the top-level directory. 
*/ +#ifndef CONFIG_SQUASHFS_FILE_DIRECT struct squashfs_page_actor { - struct page **page; - void *pageaddr; + void **page; int pages; int length; int next_page; - void (*release_pages)(struct page **, int, int); }; -extern struct squashfs_page_actor *squashfs_page_actor_init(struct page **, - int, int, void (*)(struct page **, int, int)); -extern void squashfs_page_actor_free(struct squashfs_page_actor *, int); +static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); -extern void squashfs_actor_to_buf(struct squashfs_page_actor *, void *, int); -extern void squashfs_buf_to_actor(void *, struct squashfs_page_actor *, int); -extern void squashfs_bh_to_actor(struct buffer_head **, int, - struct squashfs_page_actor *, int, int, int); -extern void squashfs_bh_to_buf(struct buffer_head **, int, void *, int, int, - int); + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + return actor; +} -/* - * Calling code should avoid sleeping between calls to squashfs_first_page() - * and squashfs_finish_page(). - */ static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { actor->next_page = 1; - return actor->pageaddr = actor->page[0] ? kmap_atomic(actor->page[0]) - : NULL; + return actor->page[0]; } static inline void *squashfs_next_page(struct squashfs_page_actor *actor) { - if (!IS_ERR_OR_NULL(actor->pageaddr)) - kunmap_atomic(actor->pageaddr); - - if (actor->next_page == actor->pages) - return actor->pageaddr = ERR_PTR(-ENODATA); - - actor->pageaddr = actor->page[actor->next_page] ? - kmap_atomic(actor->page[actor->next_page]) : NULL; - ++actor->next_page; - return actor->pageaddr; + return actor->next_page == actor->pages ? NULL : + actor->page[actor->next_page++]; } static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { - if (!IS_ERR_OR_NULL(actor->pageaddr)) - kunmap_atomic(actor->pageaddr); + /* empty */ } +#else +struct squashfs_page_actor { + union { + void **buffer; + struct page **page; + }; + void *pageaddr; + void *(*squashfs_first_page)(struct squashfs_page_actor *); + void *(*squashfs_next_page)(struct squashfs_page_actor *); + void (*squashfs_finish_page)(struct squashfs_page_actor *); + int pages; + int length; + int next_page; +}; -extern struct page **alloc_page_array(int, int); -extern void free_page_array(struct page **, int); - +extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page + **, int, int); +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_first_page(actor); +} +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_next_page(actor); +} +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + actor->squashfs_finish_page(actor); +} +#endif #endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 6093579c6c5d..887d6d270080 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -28,14 +28,8 @@ #define WARNING(s, args...) 
pr_warn("SQUASHFS: "s, ## args) /* block.c */ -extern int squashfs_init_read_wq(void); -extern void squashfs_destroy_read_wq(void); extern int squashfs_read_data(struct super_block *, u64, int, u64 *, struct squashfs_page_actor *); -extern int squashfs_read_data(struct super_block *, u64, int, u64 *, - struct squashfs_page_actor *); -extern int squashfs_read_data_async(struct super_block *, u64, int, u64 *, - struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); @@ -76,9 +70,8 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *, void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, int); -/* file_direct.c */ -extern int squashfs_readpages_block(struct page *, struct list_head *, - unsigned int *, struct address_space *, int, u64, int); +/* file_xxx.c */ +extern int squashfs_readpage_block(struct page *, u64, int); /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 3b767ce1e46d..ef69c31947bf 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -49,7 +49,7 @@ struct squashfs_cache_entry { int num_waiters; wait_queue_head_t wait_queue; struct squashfs_cache *cache; - struct page **page; + void **data; struct squashfs_page_actor *actor; }; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 4ffbddb688aa..93aa3e23c845 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -445,15 +445,9 @@ static int __init init_squashfs_fs(void) if (err) return err; - if (!squashfs_init_read_wq()) { - destroy_inodecache(); - return -ENOMEM; - } - err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); - squashfs_destroy_read_wq(); return err; } @@ -467,7 +461,6 @@ static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); - squashfs_destroy_read_wq(); } diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 14cd373e1897..c609624e4b8a 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -55,7 +55,7 @@ static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, struct comp_opts *opts; int err = 0, n; - opts = kmalloc(sizeof(*opts), GFP_ATOMIC); + opts = kmalloc(sizeof(*opts), GFP_KERNEL); if (opts == NULL) { err = -ENOMEM; goto out2; @@ -136,7 +136,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, enum xz_ret xz_err; int avail, total = 0, k = 0; struct squashfs_xz *stream = strm; - void *buf = NULL; xz_dec_reset(stream->state); stream->buf.in_pos = 0; @@ -157,20 +156,12 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (!IS_ERR(stream->buf.out)) { + if (stream->buf.out != NULL) { stream->buf.out_pos = 0; total += PAGE_CACHE_SIZE; } } - if (!stream->buf.out) { - if (!buf) { - buf = kmalloc(PAGE_CACHE_SIZE, GFP_ATOMIC); - if (!buf) - goto out; - } - stream->buf.out = buf; - } xz_err = xz_dec_run(stream->state, &stream->buf); if (stream->buf.in_pos == stream->buf.in_size && k < b) @@ -182,13 +173,11 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (xz_err != XZ_STREAM_END || k < b) goto out; - kfree(buf); return total + stream->buf.out_pos; out: for (; k < b; k++) put_bh(bh[k]); - kfree(buf); return -EIO; } diff --git 
a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 09c892b5308e..8727caba6882 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -66,7 +66,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - void *buf = NULL; int zlib_err, zlib_init = 0, k = 0; z_stream *stream = strm; @@ -85,19 +84,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (!IS_ERR(stream->next_out)) + if (stream->next_out != NULL) stream->avail_out = PAGE_CACHE_SIZE; } - if (!stream->next_out) { - if (!buf) { - buf = kmalloc(PAGE_CACHE_SIZE, GFP_ATOMIC); - if (!buf) - goto out; - } - stream->next_out = buf; - } - if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { @@ -125,13 +115,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (k < b) goto out; - kfree(buf); return stream->total_out; out: for (; k < b; k++) put_bh(bh[k]); - kfree(buf); return -EIO; } diff --git a/fs/super.c b/fs/super.c index bc7ae0f327d0..689ec96c43f8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -118,13 +118,23 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call trylock_super as it is a potential - * scalability bottleneck. The counts could get updated - * between super_cache_count and super_cache_scan anyway. - * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_shrink_count() and - * s_op->nr_cached_objects(). + * We don't call trylock_super() here as it is a scalability bottleneck, + * so we're exposed to partial setup state. The shrinker rwsem does not + * protect filesystem operations backing list_lru_shrink_count() or + * s_op->nr_cached_objects(). Counts can change between + * super_cache_count and super_cache_scan, so we really don't need locks + * here. + * + * However, if we are currently mounting the superblock, the underlying + * filesystem might be in a state of partial construction and hence it + * is dangerous to access it. trylock_super() uses a MS_BORN check to + * avoid this situation, so do the same here. The memory barrier is + * matched with the one in mount_fs() as we don't hold locks here. */ + if (!(sb->s_flags & MS_BORN)) + return 0; + smp_rmb(); + if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); @@ -1151,6 +1161,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsm sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); + + /* + * Write barrier is for super_cache_count(). We place it before setting + * MS_BORN as the data dependency between the two functions is the + * superblock structure contents that we just set up, not the MS_BORN + * flag. 
+ */ + smp_wmb(); sb->s_flags |= MS_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 02fa1dcc5969..29f5b2e589a1 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -275,7 +275,7 @@ static int __sysv_write_inode(struct inode *inode, int wait) } } brelse(bh); - return 0; + return err; } int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/timerfd.c b/fs/timerfd.c index 0548c572839c..1327a02ec778 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -50,8 +50,7 @@ static DEFINE_SPINLOCK(cancel_lock); static inline bool isalarm(struct timerfd_ctx *ctx) { return ctx->clockid == CLOCK_REALTIME_ALARM || - ctx->clockid == CLOCK_BOOTTIME_ALARM || - ctx->clockid == CLOCK_POWEROFF_ALARM; + ctx->clockid == CLOCK_BOOTTIME_ALARM; } /* @@ -143,8 +142,7 @@ static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || - ctx->clockid == CLOCK_REALTIME_ALARM || - ctx->clockid == CLOCK_POWEROFF_ALARM) && + ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { if (!ctx->might_cancel) { ctx->might_cancel = true; @@ -176,7 +174,6 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, enum hrtimer_mode htmode; ktime_t texp; int clockid = ctx->clockid; - enum alarmtimer_type type; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; @@ -187,8 +184,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->tintv = timespec_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { - type = clock2alarm(ctx->clockid); - alarm_init(&ctx->t.alarm, type, timerfd_alarmproc); + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); } else { hrtimer_init(&ctx->t.tmr, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); @@ -388,7 +387,6 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; - enum alarmtimer_type type; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); @@ -399,8 +397,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) clockid != CLOCK_REALTIME && clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME && - clockid != CLOCK_BOOTTIME_ALARM && - clockid != CLOCK_POWEROFF_ALARM)) + clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -411,12 +408,13 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; - if (isalarm(ctx)) { - type = clock2alarm(ctx->clockid); - alarm_init(&ctx->t.alarm, type, timerfd_alarmproc); - } else { + if (isalarm(ctx)) + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? 
+ ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); - } ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); @@ -488,10 +486,6 @@ static int do_timerfd_settime(int ufd, int flags, ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); - - if (ctx->clockid == CLOCK_POWEROFF_ALARM) - set_power_on_alarm(); - fdput(f); return ret; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0e659d9c69a1..613193c6bb42 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1364,6 +1364,12 @@ reread: iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = -EIO; + goto out; + } iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d859d8bd1f96..e7cc0d860499 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -793,6 +793,18 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out_unlock; /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + + /* * Check that this vma isn't already owned by a * different userfaultfd. We can't allow more than one * userfaultfd to own a single vma simultaneously or we @@ -817,6 +829,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, BUG_ON(vma->vm_ops); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -953,6 +966,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(vma->vm_ops); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fb9636cc927c..5d8d12746e6e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -528,7 +528,14 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) if (args->flags & ATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); - ASSERT(retval == 0); + if (retval) + return retval; + /* + * Since we have removed the old attr, clear ATTR_REPLACE so + * that the leaf format add routine won't trip over the attr + * not being around. + */ + args->flags &= ~ATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||

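The squashfs hunks above all converge on one output protocol: a decompressor obtains destination pages one at a time with squashfs_first_page()/squashfs_next_page() and ends the walk with squashfs_finish_page(), sleeping nowhere in between, because the CONFIG_SQUASHFS_FILE_DIRECT actor keeps a kmap_atomic() mapping live across the walk. A minimal sketch of a back-end following that protocol; copy_to_actor(), src and src_len are hypothetical, while the actor calls, min_t() and PAGE_CACHE_SIZE are the ones used in the patch:

/*
 * Hypothetical helper, not part of the patch: copies src_len bytes of
 * decompressed data into the output actor, one PAGE_CACHE_SIZE chunk
 * per page, the same way the lz4/lzo wrappers above do it inline.
 * Returns the number of bytes actually copied.
 */
static int copy_to_actor(struct squashfs_page_actor *output,
			 void *src, int src_len)
{
	void *dst = squashfs_first_page(output);
	int bytes = src_len;

	/* Must not sleep here: the direct actor holds a kmap_atomic() map. */
	while (dst != NULL && bytes > 0) {
		int avail = min_t(int, bytes, PAGE_CACHE_SIZE);

		memcpy(dst, src, avail);
		src += avail;
		bytes -= avail;
		dst = squashfs_next_page(output);
	}
	squashfs_finish_page(output);

	return src_len - bytes;
}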
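The fs/super.c hunks are a publish/observe barrier pairing: mount_fs() orders the superblock construction before setting MS_BORN, and the lockless super_cache_count() refuses to touch the superblock until it has both seen MS_BORN and issued the matching read barrier. Distilled into a self-contained sketch, assuming hypothetical stand-ins (publisher() for mount_fs(), lockless_reader() for super_cache_count(), sb_init_data and sb_born for the superblock fields and MS_BORN); only smp_wmb()/smp_rmb() and the pairing itself come from the patch:

static int sb_init_data;	/* stands in for the superblock contents */
static int sb_born;		/* stands in for the MS_BORN flag */

static void publisher(void)
{
	sb_init_data = 42;	/* construct the object... */
	smp_wmb();		/* ...and order that before the flag, */
	WRITE_ONCE(sb_born, 1);	/* then publish it */
}

static int lockless_reader(void)
{
	if (!READ_ONCE(sb_born))
		return 0;	/* partially constructed: don't touch it */
	smp_rmb();		/* pairs with smp_wmb() in publisher() */
	return sb_init_data;	/* now guaranteed to observe 42 */
}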