From 035a571120ddbe4f92b91bbe46f3eff05b6e43eb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Apr 2009 07:40:57 +0800 Subject: ocfs2: Reserve 1 more cluster in expanding_inline_dir for indexed dir. In ocfs2_expand_inline_dir, we calculate whether we need 1 extra cluster if we can't store the dx inline the root and save it in dx_alloc. So add it when we call ocfs2_reserve_clusters. Signed-off-by: Tao Ma Signed-off-by: Mark Fasheh --- fs/ocfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index e71160cda110..07d89204f0d3 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2934,7 +2934,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, */ BUG_ON(alloc > 2); - ret = ocfs2_reserve_clusters(osb, alloc, &data_ac); + ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); if (ret) { mlog_errno(ret); goto out; -- cgit v1.2.3 From 3c48f23adada870db612a0dd3488605c4af5c0a5 Mon Sep 17 00:00:00 2001 From: Subrata Modak Date: Sun, 19 Apr 2009 01:10:03 +0530 Subject: configfs: Fix Trivial Warning in fs/configfs/symlink.c I observed the following build warning with fs/configfs/symlink.c: fs/configfs/symlink.c: In function 'configfs_symlink': fs/configfs/symlink.c:138: warning: 'target_item' may be used uninitialized in this function Here is a small fix for this. Cc: Patrick Mochel Cc: Balbir Singh Cc: Sachin P Sant Signed-Off-By: Subrata Modak Signed-off-by: Joel Becker --- fs/configfs/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 932a92b31483..c8afa6b1d91d 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna struct path path; struct configfs_dirent *sd; struct config_item *parent_item; - struct config_item *target_item; + struct config_item *target_item = NULL; struct config_item_type *type; ret = -EPERM; /* What lack-of-symlink returns */ -- cgit v1.2.3 From 0fba813748f16f4eaf24d492c505226c4026d58f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 19 Mar 2009 05:08:43 +0800 Subject: ocfs2: Fix 2 warning during ocfs2 make. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/ocfs2/dir.c: In function ‘ocfs2_extend_dir’: fs/ocfs2/dir.c:2700: warning: ‘ret’ may be used uninitialized in this function fs/ocfs2/suballoc.c: In function ‘ocfs2_get_suballoc_slot_bit’: fs/ocfs2/suballoc.c:2216: warning: comparison is always true due to limited range of data type Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/dir.c | 2 +- fs/ocfs2/suballoc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 07d89204f0d3..c5752305627c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2697,7 +2697,7 @@ static int ocfs2_dx_dir_index_block(struct inode *dir, u32 *num_dx_entries, struct buffer_head *dirent_bh) { - int ret, namelen, i; + int ret = 0, namelen, i; char *de_buf, *limit; struct ocfs2_dir_entry *de; struct buffer_head *dx_leaf_bh; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b4ca5911caaf..eb21dbb0ee0b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2213,7 +2213,7 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, goto bail; } - if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && + if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); -- cgit v1.2.3 From 5b09b507daaa882d888b6cd78ee89ba9caace44b Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 21 Apr 2009 16:31:20 -0700 Subject: ocfs2: Fix some printk() warnings. The old %llu vs u64 battle. Cast them correctly. Signed-off-by: Joel Becker --- fs/ocfs2/export.c | 9 +++++---- fs/ocfs2/suballoc.c | 19 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index de3da8eb558c..15713cbb865c 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -100,7 +100,8 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { - mlog(0, "inode %llu suballoc bit is clear\n", blkno); + mlog(0, "inode %llu suballoc bit is clear\n", + (unsigned long long)blkno); status = -ESTALE; goto unlock_nfs_sync; } @@ -114,7 +115,7 @@ check_err: if (status < 0) { if (status == -ESTALE) { mlog(0, "stale inode ino: %llu generation: %u\n", - blkno, handle->ih_generation); + (unsigned long long)blkno, handle->ih_generation); } result = ERR_PTR(status); goto bail; @@ -129,8 +130,8 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { iput(inode); - mlog(0, "stale inode ino: %llu generation: %u\n", blkno, - handle->ih_generation); + mlog(0, "stale inode ino: %llu generation: %u\n", + (unsigned long long)blkno, handle->ih_generation); result = ERR_PTR(-ESTALE); goto bail; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index eb21dbb0ee0b..8439f6b324b9 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2197,18 +2197,20 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, struct buffer_head *inode_bh = NULL; struct ocfs2_dinode *inode_fe; - mlog_entry("blkno: %llu\n", blkno); + mlog_entry("blkno: %llu\n", (unsigned long long)blkno); /* dirty read disk */ status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); if (status < 0) { - mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status); + mlog(ML_ERROR, "read block %llu failed %d\n", + (unsigned long long)blkno, status); goto bail; } inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; if (!OCFS2_IS_VALID_DINODE(inode_fe)) { - mlog(ML_ERROR, "invalid inode %llu requested\n", blkno); + mlog(ML_ERROR, "invalid inode %llu requested\n", + (unsigned long long)blkno); status = -EINVAL; goto bail; } @@ -2216,7 +2218,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", - blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + (unsigned long long)blkno, + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); status = -EINVAL; goto bail; } @@ -2251,7 +2254,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, u64 bg_blkno; int status; - mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); + mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, + (unsigned int)bit); alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { @@ -2266,7 +2270,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, &group_bh); if (status < 0) { - mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status); + mlog(ML_ERROR, "read group %llu failed %d\n", + (unsigned long long)bg_blkno, status); goto bail; } @@ -2300,7 +2305,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; - mlog_entry("blkno: %llu", blkno); + mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, &suballoc_bit); -- cgit v1.2.3 From a5a0a630922a2f6a774b6dac19f70cb5abd86bb0 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 20 Apr 2009 21:34:18 -0700 Subject: ocfs2: Add missing iput() during error handling in ocfs2_dentry_attach_lock() In ocfs2_dentry_attach_lock(), if unable to get the dentry lock, we need to call iput(inode) because a failure here means no d_instantiate(), which means the normally matching iput() will not be called during dput(dentry). This patch fixes the oops that accompanies the following message: (3996,1):dlm_empty_lockres:2708 ERROR: lockres W00000000000000000a1046b06a4382 still has local locks! kernel BUG in dlm_empty_lockres at /rpmbuild/smushran/BUILD/ocfs2-1.4.2/fs/ocfs2/dlm/dlmmaster.c:2709! Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/dcache.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 7d604480557a..b574431a031d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -290,6 +290,21 @@ out_attach: else mlog_errno(ret); + /* + * In case of error, manually free the allocation and do the iput(). + * We need to do this because error here means no d_instantiate(), + * which means iput() will not be called during dput(dentry). + */ + if (ret < 0 && !alias) { + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); + } + dput(alias); return ret; -- cgit v1.2.3 From 485c26ec70f823f2a9cf45982b724893e53a859e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 13:43:20 -0400 Subject: ext4: Fix softlockup caused by illegal i_file_acl value in on-disk inode If the block containing external extended attributes (which is stored in i_file_acl and i_file_acl_high) is larger than the on-disk filesystem, the process which tried to access the extended attributes will endlessly issue kernel printks complaining that "__find_get_block_slow() failed", locking up that CPU until the system is forcibly rebooted. So when we read in the inode, make sure the i_file_acl value is legal, and if not, flag the filesystem as being corrupted. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6bd6ced3bb7..cab75bbcd57b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4409,7 +4409,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - if (ei->i_flags & EXT4_EXTENTS_FL) { + if (ei->i_file_acl && + ((ei->i_file_acl < + (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + EXT4_SB(sb)->s_gdb_count)) || + (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + ext4_error(sb, __func__, + "bad extended attribute block %llu in inode #%lu", + ei->i_file_acl, inode->i_ino); + ret = -EIO; + goto bad_inode; + } else if (ei->i_flags & EXT4_EXTENTS_FL) { /* Validate extent which is part of inode */ ret = ext4_ext_check_inode(inode); } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -- cgit v1.2.3 From 97e728d4353f38c87bf0804cdfd79a9b13fc2c3e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Apr 2009 17:40:57 -0400 Subject: Btrfs: try to keep a healthy ratio of metadata vs data block groups This patch makes the chunk allocator keep a good ratio of metadata vs data block groups. By default for every 8 data block groups, we'll allocate 1 metadata chunk, or about 12% of the disk will be allocated for metadata. This can be changed by specifying the metadata_ratio mount option. This is simply the number of data block groups that have to be allocated to force a metadata chunk allocation. By making sure we allocate metadata chunks more often, we are less likely to get into situations where the whole disk has been allocated as data block groups. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++++++++- fs/btrfs/super.c | 12 +++++++++++- 4 files changed, 42 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad96495dedc5..213535f45da2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -881,6 +881,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6b83744b05d..44c94d808e2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1604,6 +1604,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 178df4c67de4..2895a8373232 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1918,15 +1918,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -1958,6 +1972,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9744af9d71e9..30c9a8ca2a54 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -68,7 +68,7 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_flushoncommit, Opt_err, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -87,6 +87,7 @@ static match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } -- cgit v1.2.3 From 2ea2544ef5dad5cac52f1e4c7b812631274fc1cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:32:28 +0200 Subject: Btrfs: simplify makefile Get rid of the hacks for building out of tree, and always use += for assigning to the object lists. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9adf5e4f7e96..94212844a9bc 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ compression.o delayed-ref.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif -- cgit v1.2.3 From 0d4bf11e5309eff64272a49e1ea55658372abc56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:33:56 +0200 Subject: Btrfs: don't export symbols Currently the extent_map code is only for btrfs so don't export it's symbols. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b187917b36fa..9827fa1de4e1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -43,7 +43,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +63,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +81,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -264,7 +261,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -326,7 +322,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -346,4 +341,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); -- cgit v1.2.3 From 9601e3f6336f6ca66929f451b1f66085e68e36e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:33:09 +0200 Subject: Btrfs: kill btrfs_cache_create Just use kmem_cache_create directly. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 18 ++++++------------ fs/btrfs/extent_map.c | 11 +++-------- fs/btrfs/inode.c | 42 +++++++++++++++++++----------------------- 3 files changed, 28 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05a1c42e25bf..c33b54029d78 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -58,15 +52,15 @@ struct extent_page_data { int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9827fa1de4e1..30c9365861e6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 65219f6a16a1..176b6cc28b1e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4640,39 +4640,35 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); + + btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_DESTROY_BY_RCU, NULL); if (!btrfs_bit_radix_cachep) goto fail; return 0; -- cgit v1.2.3 From e980b50cda1610f1c17978d9b7fd311a9dd93877 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Apr 2009 14:39:24 -0400 Subject: Btrfs: fix fallocate deadlock on inode extent lock The btrfs fallocate call takes an extent lock on the entire range being fallocated, and then runs through insert_reserved_extent on each extent as they are allocated. The problem with this is that btrfs_drop_extents may decide to try and take the same extent lock fallocate was already holding. The solution used here is to push down knowledge of the range that is already locked going into btrfs_drop_extents. It turns out that at least one other caller had the same bug. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/file.c | 11 ++++++----- fs/btrfs/inode.c | 27 ++++++++++++++++++--------- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/tree-log.c | 2 +- 5 files changed, 29 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 213535f45da2..4414a5d9983a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2177,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 482f8db2cfd0..da3ed965c956 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -363,15 +363,16 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; u64 leaf_start; u64 ram_bytes = 0; u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; @@ -684,9 +685,9 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } btrfs_check_file(root, inode); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 176b6cc28b1e..2fdb2995be64 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -234,7 +234,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -1439,6 +1439,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1456,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1590,6 +1592,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -2877,6 +2881,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -4968,7 +4973,7 @@ out_fail: static int prealloc_file_range(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) + u64 locked_end, u64 alloc_hint, int mode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -4989,7 +4994,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5018,6 +5024,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; struct btrfs_trans_handle *trans; @@ -5039,6 +5046,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -5051,8 +5059,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, /* the extent lock is ordered inside the running * transaction */ - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5060,7 +5068,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); /* @@ -5085,7 +5093,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { ret = prealloc_file_range(trans, inode, cur_offset, - last_byte, alloc_hint, mode); + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5101,7 +5110,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7594bec1be10..f4e5d2e5ece6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -830,7 +830,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 25f20ea11f27..db5e212e8445 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || -- cgit v1.2.3 From 59bc5c758ece00fb0b2a170dd8fbbf31f1856c8a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Apr 2009 14:39:25 -0400 Subject: Btrfs: fix deadlocks and stalls on dead root removal After a transaction commit, the old root of the subvol btrees are sent through snapshot removal. This is what actually frees up any blocks replaced by COW, and anything the old blocks pointed to. Snapshot deletion will pause when a transaction commit has started, which helps to avoid a huge amount of delayed reference count updates piling up as the transaction is trying to close. But, this pause happens after the snapshot deletion process has asked other procs on the system to throttle back a bit so that it can make progress. We don't want to throttle everyone while we're waiting for the transaction commit, it leads to deadlocks in the user transaction ioctls used by Ceph and makes things slower in general. This patch changes things to avoid the throttling while we sleep. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2869b3361eb6..01b143605ec1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } -- cgit v1.2.3 From a9e817425dc0baede8ebe5fbc9984a640257432b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 16:11:18 -0400 Subject: ext4: Ignore i_file_acl_high unless EXT4_FEATURE_INCOMPAT_64BIT is present Don't try to look at i_file_acl_high unless the INCOMPAT_64BIT feature bit is set. The field is normally zero, but older versions of e2fsck didn't automatically check to make sure of this, so in the spirit of "be liberal in what you accept", don't look at i_file_acl_high unless we are using a 64-bit filesystem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cab75bbcd57b..11460037ea9d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4357,11 +4357,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - } inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); -- cgit v1.2.3 From c4b5a614316c505922a522b2e35ba05ea3e08a7c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 18:45:35 -0400 Subject: ext4: Do not try to validate extents on special files The EXTENTS_FL flag should never be set on special files, but if it is, don't bother trying to validate that the extents tree is valid, since only files, directories, and non-fast symlinks will ever have an extent data structure. We perhaps should flag the filesystem as being corrupted if we see a special file (named pipes, device nodes, Unix domain sockets, etc.) with the EXTENTS_FL flag, but e2fsck doesn't currently check this case, so we'll just ignore this for now, since it's harmless. Without this fix, a special device with the extents flag is flagged as an error by the kernel, so it is impossible to access or delete the inode, but e2fsck doesn't see it as a problem, leading to confused/frustrated users. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 11460037ea9d..e91f978c7f12 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4407,6 +4407,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } + ret = 0; if (ei->i_file_acl && ((ei->i_file_acl < (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + @@ -4418,8 +4419,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = -EIO; goto bad_inode; } else if (ei->i_flags & EXT4_EXTENTS_FL) { - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { -- cgit v1.2.3 From d6397baee468809ef311e763dfc6e9f73418f8a6 Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Mon, 27 Apr 2009 07:29:03 -0400 Subject: Btrfs: When shrinking, only update disk size on success Previously, we updated a device's size prior to attempting a shrink operation. This patch moves the device resizing logic to only happen if the shrink completes successfully. In the process, it introduces a new field to btrfs_device -- disk_total_bytes -- to track the on-disk size. Signed-off-by: Chris Ball Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 35 ++++++++++++++++++++++++----------- fs/btrfs/volumes.h | 3 +++ 2 files changed, 27 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e53835b88594..5f01dad4b696 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1543,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1940,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1979,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1992,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -3076,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5836327ba5dd..5c3ff6d02fd7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -61,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; -- cgit v1.2.3 From b7967db75a38df4891b22efe1b0969b9357eb946 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:29:04 -0400 Subject: Btrfs: remove #if 0 code Btrfs had some old code sitting around under #if 0, this drops it. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 48 +------------------------------- fs/btrfs/extent_io.c | 63 ------------------------------------------ fs/btrfs/file.c | 78 ---------------------------------------------------- 3 files changed, 1 insertion(+), 188 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44c94d808e2b..77f9a3b824be 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -584,18 +584,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, btrfs_set_work_high_prio(&async->work); btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -770,27 +759,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -1278,11 +1246,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -2334,16 +2298,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c33b54029d78..fe9eb990e443 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1401,69 +1401,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da3ed965c956..1d51dc38bb49 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -689,7 +612,6 @@ out: unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } -- cgit v1.2.3 From 193f284d4985db0370a8a1bbdfb20df548cf9ffb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:29:05 -0400 Subject: Btrfs: ratelimit IO error printks Btrfs has printks for various IO errors, including bad checksums and mismatches between what we expect the block headers to contain and what we actually find on the disk. Longer term we need a real reporting mechanism for this, but for now printk is going to have to do. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 38 +++++++++++++++++++++++++------------- fs/btrfs/inode.c | 10 ++++++---- 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 77f9a3b824be..aa0c259b9c28 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2fdb2995be64..552e08afc7fb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1823,10 +1823,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); -- cgit v1.2.3 From 45c06543afe2772c02f21efee0e2138b4e1c911e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:49:10 -0400 Subject: Btrfs: remove unused btrfs_bit_radix slab Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 552e08afc7fb..98bd5069d54a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -4641,8 +4640,6 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } @@ -4673,11 +4670,6 @@ int btrfs_init_cachep(void) if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; return 0; fail: btrfs_destroy_cachep(); -- cgit v1.2.3 From e63b6a6c0ffa2ebd8617cc1a10969000296831aa Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 21 Apr 2009 12:38:30 -0700 Subject: Btrfs: Fix a trivial warning using max() of u64 vs ULL. A small warning popped up on ia64 because inode-map.c was comparing a u64 object id with the ULL FIRST_FREE_OBJECTID. My first thought was that all the OBJECTID constants should contain the u64 cast because btrfs code deals entirely in u64s. But then I saw how large that was, and figured I'd just fix the max() call. Signed-off-by: Joel Becker Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d833c9..9abbced1123d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; -- cgit v1.2.3 From 21380931eb4da4e29ac663d0221581282cbba208 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 21 Apr 2009 12:38:29 -0700 Subject: Btrfs: Fix a bunch of printk() warnings. Just happened to notice a bunch of %llu vs u64 warnings. Here's a patch to cast them all. Signed-off-by: Joel Becker Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/extent-tree.c | 21 ++++++++++++++------- fs/btrfs/free-space-cache.c | 15 ++++++++++----- fs/btrfs/ioctl.c | 6 ++++-- fs/btrfs/super.c | 15 +++++++++------ 5 files changed, 40 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aa0c259b9c28..0ff16d3331da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1671,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1681,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -2273,7 +2273,7 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2895a8373232..e4966444811b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1844,10 +1844,14 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -2824,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 768b9523662d..0bc93657b460 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, printk(KERN_ERR "couldn't find space %llu to free\n", (unsigned long long)offset); printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, block_group->key.objectid, - block_group->key.offset); + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); btrfs_dump_free_space(block_group, bytes); } else if (info) { printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " "but wanted offset=%llu bytes=%llu\n", - info->offset, info->bytes, offset, bytes); + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); } WARN_ON(1); } @@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, - info->bytes); + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f4e5d2e5ece6..48762aa1e945 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -483,11 +483,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 30c9a8ca2a54..bf0e84c75607 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -196,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -211,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -221,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: @@ -420,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_extent != (u64)-1) - seq_printf(seq, ",max_extent=%llu", info->max_extent); + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", info->max_inline); + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); -- cgit v1.2.3 From fd1b52435a6d9663de896e8437ef067372916ef3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2009 18:10:06 +0200 Subject: quota: remove obsolete comments in fs/quota/Makefile Get rid of useless comments and the equally useless obj-y initialization. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/Makefile | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 385a0831cc99..68d4f6dc0578 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -1,12 +1,3 @@ -# -# Makefile for the Linux filesystems. -# -# 14 Sep 2000, Christoph Hellwig -# Rewritten to use lists instead of if-statements. -# - -obj-y := - obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o -- cgit v1.2.3 From a069e9cee1dba2f847839d325f46ce6976ed1b76 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 9 Apr 2009 18:07:10 +0200 Subject: ext2: missing unlock in ext2_quota_write() The inode->i_mutex should be unlocked. Found by smatch (http://repo.or.cz/w/smatch.git). Compile tested. Signed-off-by: Dan Carpenter Signed-off-by: Jan Kara --- fs/ext2/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f983225266dc..5c4afe652245 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; -- cgit v1.2.3 From 7b1a14bbb0e547aaa4d30cc376e6c8c12539ab0f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 10:49:53 -0400 Subject: Btrfs: fix acl caching Linus noticed the btrfs code to cache acls wasn't properly caching a NULL acl when the inode didn't have any acls. This meant the common case of no acls resulted in expensive btree searches every time the kernel checked permissions (which is quite often). This is a modified version of Linus' original patch: Properly set initial acl fields to BTRFS_ACL_NOT_CACHED in the inode. This forces an acl lookup when permission checks are done. Fix btrfs_get_acl to avoid lookups and locking when the inode acls fields are set to null. Fix btrfs_get_acl to use the right return value from __btrfs_getxattr when deciding to cache a NULL acl. It was storing a NULL acl when __btrfs_getxattr return -ENOENT, but __btrfs_getxattr was actually returning -ENODATA for this case. Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 18 +++++++++++++----- fs/btrfs/inode.c | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7fdd184a528d..cbba000dccbe 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } + /* Handle the cached NULL acl case without locking */ + acl = ACCESS_ONCE(*p_acl); + if (!acl) + return acl; + spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); + acl = *p_acl; + if (acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); - if (acl) + if (acl != BTRFS_ACL_NOT_CACHED) return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) btrfs_update_cached_acl(inode, p_acl, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; btrfs_update_cached_acl(inode, p_acl, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98bd5069d54a..dc812ec551f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3047,8 +3047,8 @@ static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; -- cgit v1.2.3 From 46a53cca826e71effe59e3cb4f383622c33ebdcb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 11:47:50 -0400 Subject: Btrfs: look for acls during btrfs_read_locked_inode This changes btrfs_read_locked_inode() to peek ahead in the btree for acl items. If it is certain a given inode has no acls, it will set the in memory acl fields to null to avoid acl lookups completely. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc812ec551f6..90c23eb28829 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2015,6 +2015,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_free_path(path); } +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + /* * read an inode from the btree into the in-memory inode */ @@ -2026,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2072,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); -- cgit v1.2.3 From 69838727bcd819a8fd73a88447801221788b0c6d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 28 Apr 2009 20:24:29 +0200 Subject: bio: fix memcpy corruption in bio_copy_user_iov() st driver uses blk_rq_map_user() in order to just build a request out of page frames. In this case, map_data->offset is a non zero value and iov[0].iov_base is NULL. We need to increase nr_pages for that. Cc: stable@kernel.org Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- fs/bio.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 7bbc98f0eda1..98711647ece4 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -817,6 +817,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q, len += iov[i].iov_len; } + if (offset) + nr_pages++; + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 802b352f2934f799ec2e159f61db6506094a936e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Apr 2009 21:24:28 -0700 Subject: ecryptfs: fix printk format warning fs/ecryptfs/inode.c:670: warning: format '%d' expects type 'int', but argument 3 has type 'size_t' Signed-off-by: Randy Dunlap Signed-off-by: Tyler Hicks Cc: Dustin Kirkland Signed-off-by: Andrew Morton --- fs/ecryptfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 93bc0f8174a7..1940edd08307 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -667,7 +667,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); if (lower_buf == NULL) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%d] bytes\n", __func__, lower_bufsiz); + "kmalloc [%zd] bytes\n", __func__, lower_bufsiz); rc = -ENOMEM; goto out; } -- cgit v1.2.3 From ac20100df7a7a042423dcb8847f42d9f6ddb8d00 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 27 Apr 2009 13:31:12 -0500 Subject: eCryptfs: Fix min function comparison warning This warning shows up on 64 bit builds: fs/ecryptfs/inode.c:693: warning: comparison of distinct pointer types lacks a cast Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1940edd08307..2f0945d63297 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -690,7 +690,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) } /* Check for bufsiz <= 0 done in sys_readlinkat() */ rc = copy_to_user(buf, plaintext_name, - min((unsigned) bufsiz, plaintext_name_size)); + min((size_t) bufsiz, plaintext_name_size)); if (rc) rc = -EFAULT; else -- cgit v1.2.3 From 7e31a966ad270ba32a77c157c015cd7c82faaa55 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 29 Apr 2009 01:20:55 +0800 Subject: ocfs2/trivial: Remove unused variable in ocfs2_rename. With indexed dir enabled, now we use ocfs2_dir_lookup_result to wrap all the bh used for dir. So remove the 2 unused variables. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/namei.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2220f93f668b..33464c6b60a2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1025,10 +1025,8 @@ static int ocfs2_rename(struct inode *old_dir, struct inode *orphan_dir = NULL; struct ocfs2_dinode *newfe = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; - struct buffer_head *orphan_entry_bh = NULL; struct buffer_head *newfe_bh = NULL; struct buffer_head *old_inode_bh = NULL; - struct buffer_head *insert_entry_bh = NULL; struct ocfs2_super *osb = NULL; u64 newfe_blkno, old_de_ino; handle_t *handle = NULL; @@ -1455,8 +1453,6 @@ bail: brelse(old_inode_bh); brelse(old_dir_bh); brelse(new_dir_bh); - brelse(orphan_entry_bh); - brelse(insert_entry_bh); mlog_exit(status); -- cgit v1.2.3 From def6b3ba56b637d58126ef67fc19bab57945fcc4 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 23 Apr 2009 22:18:00 -0400 Subject: xfs_file_last_byte() needs to acquire ilock We had some systems crash with this stack: [] ia64_leave_kernel+0x0/0x280 [] xfs_bmbt_get_startoff+0x0/0x20 [xfs] [] xfs_bmap_last_offset+0x210/0x280 [xfs] [] xfs_file_last_byte+0x70/0x1a0 [xfs] [] xfs_itruncate_start+0xc0/0x1a0 [xfs] [] xfs_inactive_free_eofblocks+0x290/0x460 [xfs] [] xfs_release+0x1b0/0x240 [xfs] [] xfs_file_release+0x70/0xa0 [xfs] [] __fput+0x1a0/0x420 [] fput+0x40/0x60 The problem here is that xfs_file_last_byte() does not acquire the inode lock and can therefore race with another thread that is modifying the extext list. While xfs_bmap_last_offset() is trying to lookup what was the last extent some extents were merged and the extent list shrunk so the index we lookup is now beyond the end of the extent list and potentially in a freed buffer. Signed-off-by: Lachlan McIlroy Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e7ae08d1df48..123b20c8cbf2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1258,8 +1258,10 @@ xfs_file_last_byte( * necessary. */ if (ip->i_df.if_flags & XFS_IFEXTENTS) { + xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) { last_block = 0; } -- cgit v1.2.3 From b9ec9068d79e039507a247ebc5bc9c0ce53654ce Mon Sep 17 00:00:00 2001 From: Olaf Weber Date: Fri, 17 Apr 2009 16:12:45 -0500 Subject: xfs: add more checks to superblock validation There had been reports where xfs filesystem was randomly corrupted with fsfuzzer, and xfs failed to handle it gracefully. This patch fixes couple of reported problem by providing additional checks in the superblock validation routine. Signed-off-by: Olaf Weber Reviewed-by: Josef 'Jeff' Sipek Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/xfs_mount.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b101990df027..65a99725d0cc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -291,14 +291,17 @@ xfs_mount_validate_sb( sbp->sb_sectsize > XFS_MAX_SECTORSIZE || sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || -- cgit v1.2.3 From 5f79ed685fc6122018c4b5826e2e5bdb7bc6f109 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Apr 2009 10:50:48 -0400 Subject: xfs: a couple getbmap cleanups - reshuffle various conditionals for data vs attr fork to make the code more readable - do fine-grainded goto-based error handling - exit early from conditionals instead of keeping a long else branch around - allow kmem_alloc to fail Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_bmap.c | 162 ++++++++++++++++++++++++++---------------------------- 1 file changed, 79 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3a6ed426327a..abe42448b1c0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5880,7 +5880,7 @@ xfs_getbmap( void *arg) /* formatter arg */ { __int64_t bmvend; /* last block requested */ - int error; /* return value */ + int error = 0; /* return value */ __int64_t fixlen; /* length for -1 case */ int i; /* extent number */ int lock; /* lock state */ @@ -5899,30 +5899,8 @@ xfs_getbmap( mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not - * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ - * bit is set for the file, generate a read event in order - * that the DMAPI application may do its thing before we return - * the extents. Usually this means restoring user file data to - * regions of the file that look like holes. - * - * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify - * BMV_IF_NO_DMAPI_READ so that read events are generated. - * If this were not true, callers of ioctl( XFS_IOC_GETBMAP ) - * could misinterpret holes in a DMAPI file as true holes, - * when in fact they may represent offline user data. - */ - if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_READ) && - whichfork == XFS_DATA_FORK) { - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); - if (error) - return XFS_ERROR(error); - } - if (whichfork == XFS_ATTR_FORK) { if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && @@ -5936,11 +5914,37 @@ xfs_getbmap( ip->i_mount); return XFS_ERROR(EFSCORRUPTED); } - } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - if (whichfork == XFS_DATA_FORK) { + + prealloced = 0; + fixlen = 1LL << 32; + } else { + /* + * If the BMV_IF_NO_DMAPI_READ interface bit specified, do + * not generate a DMAPI read event. Otherwise, if the + * DM_EVENT_READ bit is set for the file, generate a read + * event in order that the DMAPI application may do its thing + * before we return the extents. Usually this means restoring + * user file data to regions of the file that look like holes. + * + * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify + * BMV_IF_NO_DMAPI_READ so that read events are generated. + * If this were not true, callers of ioctl(XFS_IOC_GETBMAP) + * could misinterpret holes in a DMAPI file as true holes, + * when in fact they may represent offline user data. + */ + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && + !(iflags & BMV_IF_NO_DMAPI_READ)) { + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, + 0, 0, 0, NULL); + if (error) + return XFS_ERROR(error); + } + + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; @@ -5949,42 +5953,34 @@ xfs_getbmap( prealloced = 0; fixlen = ip->i_size; } - } else { - prealloced = 0; - fixlen = 1LL << 32; } if (bmv->bmv_length == -1) { fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset), - (__int64_t)0); - } else if (bmv->bmv_length < 0) - return XFS_ERROR(EINVAL); - if (bmv->bmv_length == 0) { + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { bmv->bmv_entries = 0; return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); } + nex = bmv->bmv_count - 1; if (nex <= 0) return XFS_ERROR(EINVAL); bmvend = bmv->bmv_offset + bmv->bmv_length; xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (((iflags & BMV_IF_DELALLOC) == 0) && - (whichfork == XFS_DATA_FORK) && - (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { - /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ - error = xfs_flush_pages(ip, (xfs_off_t)0, - -1, 0, FI_REMAPF); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; + if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + if (error) + goto out_unlock_iolock; } - } - ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) || - ip->i_delayed_blks == 0); + ASSERT(ip->i_delayed_blks == 0); + } lock = xfs_ilock_map_shared(ip); @@ -5995,23 +5991,25 @@ xfs_getbmap( if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - bmapi_flags = xfs_bmapi_aflag(whichfork) | - ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; /* * Allocate enough space to handle "subnex" maps at a time. */ + error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + if (!map) + goto out_unlock_ilock; bmv->bmv_entries = 0; - if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) { - if (((iflags & BMV_IF_DELALLOC) == 0) || - whichfork == XFS_ATTR_FORK) { - error = 0; - goto unlock_and_return; - } + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; } nexleft = nex; @@ -6023,10 +6021,12 @@ xfs_getbmap( bmapi_flags, NULL, 0, map, &nmap, NULL, NULL); if (error) - goto unlock_and_return; + goto out_free_map; ASSERT(nmap <= subnex); for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + int full = 0; /* user array is full */ + out.bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out.bmv_oflags |= BMV_OF_PREALLOC; @@ -6041,36 +6041,32 @@ xfs_getbmap( whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ out.bmv_oflags |= BMV_OF_LAST; - goto unlock_and_return; - } else { - int full = 0; /* user array is full */ - - if (!xfs_getbmapx_fix_eof_hole(ip, &out, - prealloced, bmvend, - map[i].br_startblock)) { - goto unlock_and_return; - } - - /* format results & advance arg */ - error = formatter(&arg, &out, &full); - if (error || full) - goto unlock_and_return; - nexleft--; - bmv->bmv_offset = - out.bmv_offset + out.bmv_length; - bmv->bmv_length = MAX((__int64_t)0, - (__int64_t)(bmvend - bmv->bmv_offset)); - bmv->bmv_entries++; + goto out_free_map; } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced, + bmvend, map[i].br_startblock)) + goto out_free_map; + + /* format results & advance arg */ + error = formatter(&arg, &out, &full); + if (error || full) + goto out_free_map; + nexleft--; + bmv->bmv_offset = + out.bmv_offset + out.bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + bmv->bmv_entries++; } } while (nmap && nexleft && bmv->bmv_length); -unlock_and_return: + out_free_map: + kmem_free(map); + out_unlock_ilock: xfs_iunlock_map_shared(ip, lock); + out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - kmem_free(map); - return error; } -- cgit v1.2.3 From 28e211700a81b0a934b6c7a4b8e7dda843634d2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Feb 2009 08:39:02 -0500 Subject: xfs: fix getbmap vs mmap deadlock xfs_getbmap (or rather the formatters called by it) copy out the getbmap structures under the ilock, which can deadlock against mmap. This has been reported via bugzilla a while ago (#717) and has recently also shown up via lockdep. So allocate a temporary buffer to format the kernel getbmap structures into and then copy them out after dropping the locks. A little problem with this is that we limit the number of extents we can copy out by the maximum allocation size, but I see no real way around that. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_bmap.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index abe42448b1c0..ca7c6005a487 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5890,12 +5890,13 @@ xfs_getbmap( int nexleft; /* # of user extents left */ int subnex; /* # of bmapi's can do */ int nmap; /* number of map entries */ - struct getbmapx out; /* output structure */ + struct getbmapx *out; /* output structure */ int whichfork; /* data or attr fork */ int prealloced; /* this is a file with * preallocated data space */ int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; mp = ip->i_mount; iflags = bmv->bmv_iflags; @@ -5971,6 +5972,13 @@ xfs_getbmap( return XFS_ERROR(EINVAL); bmvend = bmv->bmv_offset + bmv->bmv_length; + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); + if (!out) + return XFS_ERROR(ENOMEM); + xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { @@ -6025,39 +6033,39 @@ xfs_getbmap( ASSERT(nmap <= subnex); for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - int full = 0; /* user array is full */ - - out.bmv_oflags = 0; + out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) - out.bmv_oflags |= BMV_OF_PREALLOC; + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; else if (map[i].br_startblock == DELAYSTARTBLOCK) - out.bmv_oflags |= BMV_OF_DELALLOC; - out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); - out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out.bmv_unused1 = out.bmv_unused2 = 0; + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || (map[i].br_startblock != DELAYSTARTBLOCK)); if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ - out.bmv_oflags |= BMV_OF_LAST; + out[cur_ext].bmv_oflags |= BMV_OF_LAST; goto out_free_map; } - if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced, - bmvend, map[i].br_startblock)) + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) goto out_free_map; - /* format results & advance arg */ - error = formatter(&arg, &out, &full); - if (error || full) - goto out_free_map; nexleft--; bmv->bmv_offset = - out.bmv_offset + out.bmv_length; + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset); bmv->bmv_entries++; + cur_ext++; } } while (nmap && nexleft && bmv->bmv_length); @@ -6067,6 +6075,16 @@ xfs_getbmap( xfs_iunlock_map_shared(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + return error; } -- cgit v1.2.3 From dfa13f39b798fee68250abe1aed851395c8b51b5 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 29 Apr 2009 17:55:08 -0700 Subject: ocfs2: Fix a missing credit when deleting from indexed directories. The ocfs2 directory index updates two blocks when we remove an entry - the dx root and the dx leaf. OCFS2_DELETE_INODE_CREDITS was only accounting for the dx leaf. This shows up when ocfs2_delete_inode() runs out of credits in jbd2_journal_dirty_metadata() at "J_ASSERT_JH(jh, handle->h_buffer_credits > 0);". The test that caught this was running dirop_file_racer from the ocfs2-test suite with a 250-character filename PREFIX. Run on a 512B blocksize, it forces the orphan dir index to grow large enough to trigger. Signed-off-by: Joel Becker --- fs/ocfs2/journal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 619dd7f6c053..eb7b76331eb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -437,8 +437,9 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) } /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + - * inode alloc group descriptor + orphan dir index leaf */ -#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3) + * inode alloc group descriptor + orphan dir index root + + * orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming -- cgit v1.2.3 From a8985f3ac503b51c5abf8883fc4fb912e13b955c Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 30 Apr 2009 15:08:09 -0700 Subject: autofs4: fix incorrect return in autofs4_mount_busy() Fix an obvious incorrect return status in autofs4_mount_busy(). Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 75f7ddacf7d6..3077d8f16523 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -70,8 +70,10 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) * Otherwise it's an offset mount and we need to check * if we can umount its mount, if there is one. */ - if (!d_mountpoint(dentry)) + if (!d_mountpoint(dentry)) { + status = 0; goto done; + } } /* Update the expiry counter if fs is busy */ -- cgit v1.2.3 From b827e496c893de0c0f142abfaeb8730a2fd6b37f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 30 Apr 2009 15:08:16 -0700 Subject: mm: close page_mkwrite races Change page_mkwrite to allow implementations to return with the page locked, and also change it's callers (in page fault paths) to hold the lock until the page is marked dirty. This allows the filesystem to have full control of page dirtying events coming from the VM. Rather than simply hold the page locked over the page_mkwrite call, we call page_mkwrite with the page unlocked and allow callers to return with it locked, so filesystems can avoid LOR conditions with page lock. The problem with the current scheme is this: a filesystem that wants to associate some metadata with a page as long as the page is dirty, will perform this manipulation in its ->page_mkwrite. It currently then must return with the page unlocked and may not hold any other locks (according to existing page_mkwrite convention). In this window, the VM could write out the page, clearing page-dirty. The filesystem has no good way to detect that a dirty pte is about to be attached, so it will happily write out the page, at which point, the filesystem may manipulate the metadata to reflect that the page is no longer dirty. It is not always possible to perform the required metadata manipulation in ->set_page_dirty, because that function cannot block or fail. The filesystem may need to allocate some data structure, for example. And the VM cannot mark the pte dirty before page_mkwrite, because page_mkwrite is allowed to fail, so we must not allow any window where the page could be written to if page_mkwrite does fail. This solution of holding the page locked over the 3 critical operations (page_mkwrite, setting the pte dirty, and finally setting the page dirty) closes out races nicely, preventing page cleaning for writeout being initiated in that window. This provides the filesystem with a strong synchronisation against the VM here. - Sage needs this race closed for ceph filesystem. - Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913). - I need it for fsblock. - I suspect other filesystems may need it too (eg. btrfs). - I have converted buffer.c to the new locking. Even simple block allocation under dirty pages might be susceptible to i_size changing under partial page at the end of file (we also have a buffer.c-side problem here, but it cannot be fixed properly without this patch). - Other filesystems (eg. NFS, maybe btrfs) will need to change their page_mkwrite functions themselves. [ This also moves page_mkwrite another step closer to fault, which should eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a filesystem calldown and page lock/unlock cycle in __do_fault. ] [akpm@linux-foundation.org: fix derefs of NULL ->mapping] Cc: Sage Weil Cc: Trond Myklebust Signed-off-by: Nick Piggin Cc: Valdis Kletnieks Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index b3e5be7514f5..aed297739eb0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2397,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if ((page->mapping != inode->i_mapping) || (page_offset(page) > size)) { /* page got truncated out from underneath us */ - goto out_unlock; + unlock_page(page); + goto out; } /* page is wholly or partially inside EOF */ @@ -2411,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, ret = block_commit_write(page, 0, end); if (unlikely(ret)) { + unlock_page(page); if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - } + } else + ret = VM_FAULT_LOCKED; -out_unlock: - unlock_page(page); +out: return ret; } -- cgit v1.2.3 From 0816178638c15ce5472d39d771a96860dff4141a Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Thu, 30 Apr 2009 15:08:18 -0700 Subject: pagemap: require aligned-length, non-null reads of /proc/pid/pagemap The intention of commit aae8679b0ebcaa92f99c1c3cb0cd651594a43915 ("pagemap: fix bug in add_to_pagemap, require aligned-length reads of /proc/pid/pagemap") was to force reads of /proc/pid/pagemap to be a multiple of 8 bytes, but now it allows to read 0 bytes, which actually puts some data to user's buffer. According to POSIX, if count is zero, read() should return zero and has no other results. Signed-off-by: Vitaly Mayatskikh Cc: Thomas Tuttle Acked-by: Matt Mackall Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 39e4ad4f59f4..6f61b7cc32e0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -665,6 +665,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_task; ret = 0; + + if (!count) + goto out_task; + mm = get_task_mm(task); if (!mm) goto out_task; -- cgit v1.2.3 From 74641f584da8eccf30becfbb5507ab457187db22 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Thu, 30 Apr 2009 15:08:49 -0700 Subject: alpha: binfmt_aout fix This fixes the problem introduced by commit 3bfacef412 (get rid of special-casing the /sbin/loader on alpha): osf/1 ecoff binary segfaults when binfmt_aout built as module. That happens because aout binary handler gets on the top of the binfmt list due to late registration, and kernel attempts to execute the binary without preparatory work that must be done by binfmt_loader. Fixed by changing the registration order of the default binfmt handlers using list_add_tail() and introducing insert_binfmt() function which places new handler on the top of the binfmt list. This might be generally useful for installing arch-specific frontends for default handlers or just for overriding them. Signed-off-by: Ivan Kokshaysky Cc: Al Viro Cc: Richard Henderson Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a3a8ce83940f..639177b0eeac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -69,17 +69,18 @@ int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); -int register_binfmt(struct linux_binfmt * fmt) +int __register_binfmt(struct linux_binfmt * fmt, int insert) { if (!fmt) return -EINVAL; write_lock(&binfmt_lock); - list_add(&fmt->lh, &formats); + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); return 0; } -EXPORT_SYMBOL(register_binfmt); +EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { -- cgit v1.2.3 From 00a62ce91e554198ef28234c91c36f850f5a3bc9 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 30 Apr 2009 15:08:51 -0700 Subject: mm: fix Committed_AS underflow on large NR_CPUS environment The Committed_AS field can underflow in certain situations: > # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c > 1 Committed_AS: 18446744073709323392 kB > 11 Committed_AS: 18446744073709455488 kB > 6 Committed_AS: 35136 kB > 5 Committed_AS: 18446744073709454400 kB > 7 Committed_AS: 35904 kB > 3 Committed_AS: 18446744073709453248 kB > 2 Committed_AS: 34752 kB > 9 Committed_AS: 18446744073709453248 kB > 8 Committed_AS: 34752 kB > 3 Committed_AS: 18446744073709320960 kB > 7 Committed_AS: 18446744073709454080 kB > 3 Committed_AS: 18446744073709320960 kB > 5 Committed_AS: 18446744073709454080 kB > 6 Committed_AS: 18446744073709320960 kB Because NR_CPUS can be greater than 1000 and meminfo_proc_show() does not check for underflow. But NR_CPUS proportional isn't good calculation. In general, possibility of lock contention is proportional to the number of online cpus, not theorical maximum cpus (NR_CPUS). The current kernel has generic percpu-counter stuff. using it is right way. it makes code simplify and percpu_counter_read_positive() don't make underflow issue. Reported-by: Dave Hansen Signed-off-by: KOSAKI Motohiro Cc: Eric B Munson Cc: Mel Gorman Cc: Christoph Lameter Cc: [All kernel versions] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 74ea974f5ca6..c6b0302af4c4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); - committed = atomic_long_read(&vm_committed_space); + committed = percpu_counter_read_positive(&vm_committed_as); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; -- cgit v1.2.3 From 0ae05fb254a5f2617fc8fcfff7be959b54a5e963 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 30 Apr 2009 15:08:54 -0700 Subject: ptrace: s/parent/real_parent/ in binfmt_elf_fdpic.c ->real_parent is the parent. ->parent may be the tracer. Signed-off-by: Oleg Nesterov Acked-by: David Howells Acked-by: Roland McGrath Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 70cfc4b84ae0..fdb66faa24f1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1388,7 +1388,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; prstatus->pr_pid = task_pid_vnr(p); - prstatus->pr_ppid = task_pid_vnr(p->parent); + prstatus->pr_ppid = task_pid_vnr(p->real_parent); prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { @@ -1433,7 +1433,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_psargs[len] = 0; psinfo->pr_pid = task_pid_vnr(p); - psinfo->pr_ppid = task_pid_vnr(p->parent); + psinfo->pr_ppid = task_pid_vnr(p->real_parent); psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); -- cgit v1.2.3