diff options
Diffstat (limited to 'fs')
52 files changed, 494 insertions, 287 deletions
diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 181e05b46e72..92448d0ad900 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -224,9 +224,10 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); - affs_unlock_dir(dir); - if (IS_ERR(bh)) + if (IS_ERR(bh)) { + affs_unlock_dir(dir); return ERR_CAST(bh); + } if (bh) { u32 ino = bh->b_blocknr; @@ -240,10 +241,13 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } affs_brelse(bh); inode = affs_iget(sb, ino); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + affs_unlock_dir(dir); return ERR_CAST(inode); + } } d_add(dentry, inode); + affs_unlock_dir(dir); return NULL; } @@ -629,9 +629,8 @@ static void free_ioctx_users(struct percpu_ref *ref) while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); - - list_del_init(&req->ki_list); kiocb_cancel(req); + list_del_init(&req->ki_list); } spin_unlock_irq(&ctx->ctx_lock); @@ -1067,8 +1066,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; + if (percpu_ref_tryget_live(&ctx->users)) + ret = ctx; } out: rcu_read_unlock(); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e2f5be261532..38ee08675468 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2769,6 +2769,8 @@ again: * contention with the cow code */ if (cow) { + bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); + /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -2793,9 +2795,13 @@ again: } btrfs_set_path_blocking(p); - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); + if (last_level) + err = btrfs_cow_block(trans, root, b, NULL, 0, + &b); + else + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); if (err) { ret = err; goto done; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 85b207d19aa5..7efd70bfeaf7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -923,7 +923,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (cpu_has_xmm4_2) + if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) return 0; #endif return 1; @@ -1196,7 +1196,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); if (ret < 0) { kfree(writers); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 260f94b019c9..982a9d509817 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4392,6 +4392,7 @@ again: if (wait_for_alloc) { mutex_unlock(&fs_info->chunk_mutex); wait_for_alloc = 0; + cond_resched(); goto again; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d4a6eef31854..052973620595 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1861,10 +1861,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp) static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) { int ret; + struct blk_plug plug; + /* + * This is only called in fsync, which would do synchronous writes, so + * a plug can merge adjacent IOs as much as possible. Esp. in case of + * multiple disks using raid profile, a large IO can be split to + * several segments of stripe length (currently 64K). + */ + blk_start_plug(&plug); atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); + blk_finish_plug(&plug); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81b5a461d94e..1f01a8172308 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6413,8 +6413,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } else { btrfs_update_inode(trans, root, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); } out_unlock: @@ -6489,8 +6488,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans, root); @@ -6633,12 +6631,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_fail_inode; - d_instantiate(dentry, inode); - /* - * mkdir is special. We're unlocking after we call d_instantiate - * to avoid a race with nfsd calling d_instantiate. - */ - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); drop_on_err = 0; out_fail: @@ -9789,8 +9782,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1a33d3eb36de..b9fa99577bf7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2160,11 +2160,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, } /* - * reconstruct from the q stripe if they are - * asking for mirror 3 + * Loop retry: + * for 'mirror == 2', reconstruct from all other stripes. + * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num == 3) - rbio->failb = rbio->real_stripes - 2; + if (mirror_num > 2) { + /* + * 'mirror == 3' is to fail the p stripe and + * reconstruct from the q stripe. 'mirror > 3' is to + * fail a data stripe and reconstruct from p+q stripe. + */ + rbio->failb = rbio->real_stripes - (mirror_num - 1); + ASSERT(rbio->failb > 0); + if (rbio->failb <= rbio->faila) + rbio->failb--; + } ret = lock_stripe_add(rbio); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 19b56873b797..83c73738165e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4674,6 +4674,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) u64 len; int ret = 0; + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, end - offset); + p = fs_path_alloc(); if (!p) return -ENOMEM; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 846d277b1901..2b2978c04e80 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -70,7 +70,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, btrfs_set_extent_generation(leaf, item, 1); btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); block_info = (struct btrfs_tree_block_info *)(item + 1); - btrfs_set_tree_block_level(leaf, block_info, 1); + btrfs_set_tree_block_level(leaf, block_info, 0); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); if (parent > 0) { btrfs_set_extent_inline_ref_type(leaf, iref, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6ba022ed4a52..738f5d6beb95 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2223,8 +2223,10 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret == 1) break; + else if (ret < 0) + goto out; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -3378,8 +3380,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from this directory and from this transaction */ ret = btrfs_next_leaf(root, path); - if (ret == 1) { - last_offset = (u64)-1; + if (ret) { + if (ret == 1) + last_offset = (u64)-1; + else + err = ret; goto done; } btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); @@ -3830,6 +3835,7 @@ fill_holes: ASSERT(ret == 0); src = src_path->nodes[0]; i = 0; + need_find_last_extent = true; } btrfs_item_key_to_cpu(src, &key, i); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ed75d70b4bc2..b4d63a9842fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5056,7 +5056,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - ret = 3; + /* + * There could be two corrupted data stripes, we need + * to loop retry in order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the + * stripe under reconstruction. + */ + ret = map->num_stripes; else ret = 1; free_extent_map(em); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0c92af11f4f4..8632380d2b94 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6421,9 +6421,7 @@ SetEARetry: pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_EA); - parm_data = - (struct fealist *) (((char *) &pSMB->hdr.Protocol) + - offset); + parm_data = (void *)pSMB + offsetof(struct smb_hdr, Protocol) + offset; pSMB->ParameterOffset = cpu_to_le16(param_offset); pSMB->DataOffset = cpu_to_le16(offset); pSMB->SetupCount = 1; diff --git a/fs/dcache.c b/fs/dcache.c index 750ddc627855..7df640155468 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1897,6 +1897,28 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) EXPORT_SYMBOL(d_instantiate_unique); +/* + * This should be equivalent to d_instantiate() + unlock_new_inode(), + * with lockdep-related part of unlock_new_inode() done before + * anything else. Use that instead of open-coding d_instantiate()/ + * unlock_new_inode() combinations. + */ +void d_instantiate_new(struct dentry *entry, struct inode *inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!inode); + lockdep_annotate_inode_mutex_key(inode); + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_instantiate_new); + /** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cb3ecf442d96..d108ed116858 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -291,7 +291,6 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, iput(ecryptfs_inode); goto out; } - unlock_new_inode(ecryptfs_inode); crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; if (get_events() && get_events()->open_cb) @@ -299,7 +298,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, ecryptfs_inode_to_lower(ecryptfs_inode), crypt_stat); - d_instantiate(ecryptfs_dentry, ecryptfs_inode); + d_instantiate_new(ecryptfs_dentry, ecryptfs_inode); out: return rc; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 3267a80dbbe2..da3d40ef1668 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -40,8 +40,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ext2_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -267,8 +266,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out: return err; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b9324d0ff218..cf09d5f71ad2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2426,8 +2426,7 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } drop_nlink(inode); @@ -2666,8 +2665,7 @@ out_clear_inode: err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0bfcfed35e2..97c17b3d984c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -238,15 +238,12 @@ enum { #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2048 -#define BATCHED_TRIM_SEGMENTS(sbi) \ - (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) -#define BATCHED_TRIM_BLOCKS(sbi) \ - (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -753,7 +750,8 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front) { - return back->lstart + back->len == front->lstart; + return (back->lstart + back->len == front->lstart) && + (back->len + front->len < DEF_MAX_DISCARD_LEN); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, @@ -1139,6 +1137,7 @@ enum { enum fsync_mode { FSYNC_MODE_POSIX, /* fsync follows posix semantics */ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2853,8 +2852,6 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void init_discard_policy(struct discard_policy *dpolicy, int discard_type, - unsigned int granularity); void drop_discard_cmd(struct f2fs_sb_info *sbi); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7587758a285f..40d03d58b390 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -309,7 +309,7 @@ sync_nodes: remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic) + if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { remove_ino_entry(sbi, ino, UPDATE_INO); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fecae8685d2a..0355891dbbf8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -294,8 +294,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -594,8 +593,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, err = page_symlink(inode, disk_link.name, disk_link.len); err_out: - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); /* * Let's flush symlink data in order to avoid broken symlink as much as @@ -658,8 +656,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) alloc_nid_done(sbi, inode->i_ino); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -710,8 +707,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 01bc94df9f00..a02d5c1a7ed2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,6 +996,39 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, #endif } +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->granularity = granularity; + + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = true; + dpolicy->sync = false; + if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + dpolicy->granularity = 1; + dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->io_aware = false; + } +} + + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, @@ -1210,68 +1243,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, - unsigned int start, unsigned int end) -{ - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct discard_cmd *prev_dc = NULL, *next_dc = NULL; - struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct discard_cmd *dc; - struct blk_plug plug; - int issued; - -next: - issued = 0; - - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); - - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, - NULL, start, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); - if (!dc) - dc = next_dc; - - blk_start_plug(&plug); - - while (dc && dc->lstart <= end) { - struct rb_node *node; - - if (dc->len < dpolicy->granularity) - goto skip; - - if (dc->state != D_PREP) { - list_move_tail(&dc->list, &dcc->fstrim_list); - goto skip; - } - - __submit_discard_cmd(sbi, dpolicy, dc); - - if (++issued >= dpolicy->max_requests) { - start = dc->lstart + dc->len; - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); - - schedule(); - - goto next; - } -skip: - node = rb_next(&dc->rb_node); - dc = rb_entry_safe(node, struct discard_cmd, rb_node); - - if (fatal_signal_pending(current)) - break; - } - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); -} - static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1412,7 +1383,18 @@ next: static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + struct discard_policy dp; + + if (dpolicy) { + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + return; + } + + /* wait all */ + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1457,11 +1439,13 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; - init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); - __wait_all_discard_cmd(sbi, &dpolicy); + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); return dropped; } @@ -1477,7 +1461,7 @@ static int issue_discard_thread(void *data) set_freezable(); do { - init_discard_policy(&dpolicy, DPOLICY_BG, + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); wait_event_interruptible_timeout(*q, @@ -1495,7 +1479,7 @@ static int issue_discard_thread(void *data) dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1788,32 +1772,6 @@ skip: wake_up_discard_thread(sbi, false); } -void init_discard_policy(struct discard_policy *dpolicy, - int discard_type, unsigned int granularity) -{ - /* common policy */ - dpolicy->type = discard_type; - dpolicy->sync = true; - dpolicy->granularity = granularity; - - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - - if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; - } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->io_aware = false; - } -} - static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -2453,11 +2411,72 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) return has_candidate; } +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + __wait_all_discard_cmd(sbi, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno, cur_segno; + unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; @@ -2483,40 +2502,36 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; - /* do checkpoint to issue discard commands safely */ - for (cur_segno = start_segno; cur_segno <= end_segno; - cur_segno = cpc.trim_end + 1) { - cpc.trim_start = cur_segno; - - if (sbi->discard_blks == 0) - break; - else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) - cpc.trim_end = end_segno; - else - cpc.trim_end = min_t(unsigned int, - rounddown(cur_segno + - BATCHED_TRIM_SEGMENTS(sbi), - sbi->segs_per_sec) - 1, end_segno); - - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); - if (err) - break; + if (sbi->discard_blks == 0) + goto out; - schedule(); - } + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + if (err) + goto out; start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + end_block = START_BLOCK(sbi, end_segno + 1); - init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (!test_opt(sbi, DISCARD)) { + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + range->len = F2FS_BLK_TO_BYTES(trimmed); + } out: - range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } @@ -3904,8 +3919,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; - INIT_LIST_HEAD(&sm_info->sit_entry_set); init_rwsem(&sm_info->curseg_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 55b2bad55671..cb57ad3ca32d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -740,6 +740,10 @@ static int parse_options(struct super_block *sb, char *options) } else if (strlen(name) == 6 && !strncmp(name, "strict", 6)) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else if (strlen(name) == 9 && + !strncmp(name, "nobarrier", 9)) { + F2FS_OPTION(sbi).fsync_mode = + FSYNC_MODE_NOBARRIER; } else { kfree(name); return -EINVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f33a56d6e6dd..2c53de9251be 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -245,6 +245,9 @@ out: return count; } + if (!strcmp(a->attr.name, "trim_sections")) + return -EINVAL; + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 6b35fc4860a0..1de16a5a5c4e 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -776,6 +776,7 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); +again: spin_lock(&object->lock); cookie = object->cookie; @@ -816,10 +817,6 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; page = results[0]; _debug("gang %d [%lx]", n, page->index); - if (page->index >= op->store_limit) { - fscache_stat(&fscache_n_store_pages_over_limit); - goto superseded; - } radix_tree_tag_set(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG); @@ -829,6 +826,9 @@ static void fscache_write_op(struct fscache_operation *_op) spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); + if (page->index >= op->store_limit) + goto discard_page; + fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); @@ -844,6 +844,11 @@ static void fscache_write_op(struct fscache_operation *_op) _leave(""); return; +discard_page: + fscache_stat(&fscache_n_store_pages_over_limit); + fscache_end_page_write(object, page); + goto again; + superseded: /* this writer is going away and there aren't any more things to * write */ diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1543aa1b2a93..8744bd773823 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -806,7 +806,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes, max_blks = UINT_MAX; + loff_t bytes, max_bytes, max_blks; int error; const loff_t pos = offset; const loff_t count = len; @@ -858,7 +858,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t return error; /* ap.allowed tells us how many blocks quota will allow * us to write. Check if this reduces max_blks */ - if (ap.allowed && ap.allowed < max_blks) + max_blks = UINT_MAX; + if (ap.allowed) max_blks = ap.allowed; error = gfs2_inplace_reserve(ip, &ap); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index ad04b3acae2b..a81ed38d8442 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -43,6 +43,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int ret; + + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 30c4c9ebb693..e27317169697 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -207,8 +207,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -428,8 +427,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -573,8 +571,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -745,8 +742,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf1682036d..85e2594fe95c 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -361,7 +361,6 @@ error_io: ret = -EIO; error: mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); iget_failed(inode); return ERR_PTR(ret); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9d7551f5c32a..f217ae750adb 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -178,8 +178,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -313,8 +312,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1058,8 +1056,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1443,8 +1440,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out1: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f397e62de5a..41c8ddbc80dc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1780,7 +1780,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err) { switch (err) { default: @@ -1827,7 +1827,11 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -ENOMEM: case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + if (fl) { + struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } return 0; } return err; @@ -1863,7 +1867,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, err = nfs4_open_recover_helper(opendata, FMODE_READ); } nfs4_opendata_put(opendata); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) @@ -6157,7 +6161,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, if (err != 0) return err; err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } struct nfs_release_lockowner_data { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 83fba40396ae..44f5cea49699 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1386,6 +1386,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; + struct nfs4_lock_state *lsp; int status = 0; struct file_lock_context *flctx = inode->i_flctx; struct list_head *list; @@ -1426,7 +1427,9 @@ restart: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); status = 0; } spin_lock(&flctx->flc_lock); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0fbd3ab1be22..44a7bbbf92f8 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -31,7 +31,7 @@ static struct ctl_table nfs4_cb_sysctls[] = { .data = &nfs_idmap_cache_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { } }; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c9a1a491aa91..cd7f5b0abe84 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -50,8 +50,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = nilfs_add_link(dentry, inode); if (!err) { - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -246,8 +245,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; nilfs_mark_inode_dirty(inode); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); out: if (!err) err = nilfs_transaction_commit(dir->i_sb); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 164307b99405..1e0d8da0d3cd 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -314,7 +314,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) return ERR_PTR(ret); } + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 0); brelse(di_bh); @@ -333,7 +335,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); @@ -364,8 +368,10 @@ int ocfs2_init_acl(handle_t *handle, if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2ee7fe747cea..c55a9c47ac17 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_shutting_down(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - void dlm_unregister_domain(struct dlm_ctxt *dlm) { int leave = 0; diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index fd6122a38dbd..8a9281411c18 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -28,7 +28,30 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_shutting_down(struct dlm_ctxt *dlm); +static inline int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static inline int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 4a338803e7e9..88149b4387c2 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1377,6 +1377,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, if (!dlm_grab(dlm)) return -EINVAL; + if (!dlm_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not joined! " + "lockres %.*s, master %u\n", + dlm->name, mres->lockname_len, + mres->lockname, mres->master); + dlm_put(dlm); + return -EINVAL; + } + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); real_master = mres->master; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 13534f4fe5b5..722eb5bc9b8f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle, /* we can safely remove this assertion after testing. */ if (!buffer_uptodate(bh)) { mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); - mlog(ML_ERROR, "b_blocknr=%llu\n", - (unsigned long long)bh->b_blocknr); + mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n", + (unsigned long long)bh->b_blocknr, bh->b_state); lock_buffer(bh); /* - * A previous attempt to write this buffer head failed. - * Nothing we can do but to retry the write and hope for - * the best. + * A previous transaction with a couple of buffer heads fail + * to checkpoint, so all the bhs are marked as BH_Write_EIO. + * For current transaction, the bh is just among those error + * bhs which previous transaction handle. We can't just clear + * its BH_Write_EIO and reuse directly, since other bhs are + * not written to disk yet and that will cause metadata + * inconsistency. So we should set fs read-only to avoid + * further damage. */ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - if (!buffer_uptodate(bh)) { unlock_buffer(bh); - return -EIO; + return ocfs2_error(osb->sb, "A previous attempt to " + "write this buffer head failed\n"); } unlock_buffer(bh); } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2de4c8a9340c..4f5141350af8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -477,9 +477,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); - /* FIXME: Should ERROR_RO_FS */ mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; @@ -508,7 +507,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 877830b05e12..4f0788232f2f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -639,9 +639,11 @@ int ocfs2_calc_xattr_init(struct inode *dir, si->value_len); if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, "", NULL, 0); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (acl_len > 0) { a_size = ocfs2_xattr_entry_real_size(0, acl_len); if (S_ISDIR(mode)) diff --git a/fs/proc/base.c b/fs/proc/base.c index 2b7fe41740f5..4c0989076151 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -95,6 +95,8 @@ #include "internal.h" #include "fd.h" +#include "../../lib/kstrtox.h" + /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during @@ -2040,8 +2042,33 @@ end_instantiate: static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) return -EINVAL; + str += len; + + if (*str != '-') + return -EINVAL; + str++; + + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; return 0; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4dbe1e2daeca..5e1054f028af 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -654,7 +654,10 @@ static bool proc_sys_link_fill_cache(struct file *file, struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3ebc70167e41..eb611bdd4725 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -687,8 +687,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -771,8 +770,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -871,8 +869,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* the above add_entry did not update dir's stat data */ reiserfs_update_sd(&th, dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); @@ -1186,8 +1183,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c97b5a8d1e24..f34c545f4e54 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -611,8 +611,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } @@ -722,8 +721,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); mark_inode_dirty(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/super.c b/fs/udf/super.c index ee09c97f3ab2..159977ec8e54 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2073,8 +2073,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) bool lvid_open = false; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); - uopt.uid = INVALID_UID; - uopt.gid = INVALID_GID; + /* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */ + uopt.uid = make_kuid(current_user_ns(), overflowuid); + uopt.gid = make_kgid(current_user_ns(), overflowgid); uopt.umask = 0; uopt.fmode = UDF_INVALID_MODE; uopt.dmode = UDF_INVALID_MODE; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 47966554317c..2ec7689c25cf 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -38,8 +38,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ufs_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -191,8 +190,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; out_fail: diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e1e7fe3b5424..b663b756f552 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1924,6 +1924,93 @@ xfs_alloc_space_available( } /* + * Check the agfl fields of the agf for inconsistency or corruption. The purpose + * is to detect an agfl header padding mismatch between current and early v5 + * kernels. This problem manifests as a 1-slot size difference between the + * on-disk flcount and the active [first, last] range of a wrapped agfl. This + * may also catch variants of agfl count corruption unrelated to padding. Either + * way, we'll reset the agfl and warn the user. + * + * Return true if a reset is required before the agfl can be used, false + * otherwise. + */ +static bool +xfs_agfl_needs_reset( + struct xfs_mount *mp, + struct xfs_agf *agf) +{ + uint32_t f = be32_to_cpu(agf->agf_flfirst); + uint32_t l = be32_to_cpu(agf->agf_fllast); + uint32_t c = be32_to_cpu(agf->agf_flcount); + int agfl_size = XFS_AGFL_SIZE(mp); + int active; + + /* no agfl header on v4 supers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + + /* + * The agf read verifier catches severe corruption of these fields. + * Repeat some sanity checks to cover a packed -> unpacked mismatch if + * the verifier allows it. + */ + if (f >= agfl_size || l >= agfl_size) + return true; + if (c > agfl_size) + return true; + + /* + * Check consistency between the on-disk count and the active range. An + * agfl padding mismatch manifests as an inconsistent flcount. + */ + if (c && l >= f) + active = l - f + 1; + else if (c) + active = agfl_size - f + l + 1; + else + active = 0; + + return active != c; +} + +/* + * Reset the agfl to an empty state. Ignore/drop any existing blocks since the + * agfl content cannot be trusted. Warn the user that a repair is required to + * recover leaked blocks. + * + * The purpose of this mechanism is to handle filesystems affected by the agfl + * header padding mismatch problem. A reset keeps the filesystem online with a + * relatively minor free space accounting inconsistency rather than suffer the + * inevitable crash from use of an invalid agfl block. + */ +static void +xfs_agfl_reset( + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + + ASSERT(pag->pagf_agflreset); + trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); + + xfs_warn(mp, + "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " + "Please unmount and run xfs_repair.", + pag->pag_agno, pag->pagf_flcount); + + agf->agf_flfirst = 0; + agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flcount = 0; + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | + XFS_AGF_FLCOUNT); + + pag->pagf_flcount = 0; + pag->pagf_agflreset = false; +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -1983,6 +2070,10 @@ xfs_alloc_fix_freelist( } } + /* reset a padding mismatched agfl before final free space check */ + if (pag->pagf_agflreset) + xfs_agfl_reset(tp, agbp, pag); + /* If there isn't enough total space or single-extent, reject it. */ need = xfs_alloc_min_freelist(mp, pag); if (!xfs_alloc_space_available(args, need, flags)) @@ -2121,6 +2212,7 @@ xfs_alloc_get_freelist( agf->agf_flfirst = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; @@ -2226,6 +2318,7 @@ xfs_alloc_put_freelist( agf->agf_fllast = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; @@ -2417,6 +2510,7 @@ xfs_alloc_read_agf( pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; + pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); } #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f949818fa1c7..fb9636cc927c 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -130,9 +130,6 @@ xfs_attr_get( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (!xfs_inode_hasattr(ip)) - return -ENOATTR; - error = xfs_attr_args_init(&args, ip, name, flags); if (error) return error; @@ -417,9 +414,6 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - error = xfs_attr_args_init(&args, dp, name, flags); if (error) return error; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index e85a9519a5ae..64ad05cb831a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -50,19 +50,19 @@ xfs_trim_extents( pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) - goto out_put_perag; - - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); - /* * Force out the log. This means any transactions that might have freed - * space before we took the AGF buffer lock are now on disk, and the + * space before we take the AGF buffer lock are now on disk, and the * volatile disk cache is flushed. */ xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + /* * Look up the longest btree in the AGF and start with it. */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f52c72a1a06f..73b725f965eb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3323,8 +3323,6 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3432,8 +3430,6 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3467,9 +3463,6 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10..ae3e52749f20 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -278,6 +278,7 @@ typedef struct xfs_perag { char pagi_inodeok; /* The agi is ok for inodes */ __uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ + bool pagf_agflreset; /* agfl requires reset before use */ __uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 877079eb0f8f..cc6fa64821d2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1485,7 +1485,7 @@ TRACE_EVENT(xfs_trans_commit_lsn, __entry->lsn) ); -TRACE_EVENT(xfs_agf, +DECLARE_EVENT_CLASS(xfs_agf_class, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), TP_ARGS(mp, agf, flags, caller_ip), @@ -1541,6 +1541,13 @@ TRACE_EVENT(xfs_agf, __entry->longest, (void *)__entry->caller_ip) ); +#define DEFINE_AGF_EVENT(name) \ +DEFINE_EVENT(xfs_agf_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agf, flags, caller_ip)) +DEFINE_AGF_EVENT(xfs_agf); +DEFINE_AGF_EVENT(xfs_agfl_reset); TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, |
