diff options
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/btrfs/file.c | 2 | ||||
| -rw-r--r-- | fs/btrfs/tree-log.c | 137 | ||||
| -rw-r--r-- | fs/coredump.c | 30 | ||||
| -rw-r--r-- | fs/dcache.c | 5 | ||||
| -rw-r--r-- | fs/ext4/ext4.h | 23 | ||||
| -rw-r--r-- | fs/ext4/move_extent.c | 11 | ||||
| -rw-r--r-- | fs/ext4/super.c | 47 | ||||
| -rw-r--r-- | fs/fhandle.c | 2 | ||||
| -rw-r--r-- | fs/fs-writeback.c | 37 | ||||
| -rw-r--r-- | fs/fuse/cuse.c | 4 | ||||
| -rw-r--r-- | fs/fuse/file.c | 33 | ||||
| -rw-r--r-- | fs/fuse/fuse_i.h | 9 | ||||
| -rw-r--r-- | fs/jbd2/journal.c | 17 | ||||
| -rw-r--r-- | fs/nfs/dir.c | 6 | ||||
| -rw-r--r-- | fs/nfs/inode.c | 2 | ||||
| -rw-r--r-- | fs/nfs/nfs4file.c | 4 | ||||
| -rw-r--r-- | fs/nfsd/nfs4proc.c | 1 | ||||
| -rw-r--r-- | fs/nfsd/nfs4xdr.c | 13 | ||||
| -rw-r--r-- | fs/ocfs2/dlm/dlmconvert.c | 24 | ||||
| -rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 1 | ||||
| -rw-r--r-- | fs/open.c | 6 | ||||
| -rw-r--r-- | fs/overlayfs/super.c | 33 | ||||
| -rw-r--r-- | fs/proc_namespace.c | 2 | ||||
| -rw-r--r-- | fs/quota/dquot.c | 3 | ||||
| -rw-r--r-- | fs/splice.c | 3 | ||||
| -rw-r--r-- | fs/xfs/xfs_attr_list.c | 19 |
26 files changed, 395 insertions, 79 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f09526aa7d9..5e5db3687e34 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1885,7 +1885,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 323e12cc9d2f..0e044d7ee721 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4406,6 +4406,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, return ret; } +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * <power fail> + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * <power fail> + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * <power fail> + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. + */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct inode *inode) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size_nr(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, + search_path, parent, + name, this_name_len, 0); + if (di && !IS_ERR(di)) { + ret = 1; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4578,6 +4699,22 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + if ((min_key.type == BTRFS_INODE_REF_KEY || + min_key.type == BTRFS_INODE_EXTREF_KEY) && + BTRFS_I(inode)->generation == trans->transid) { + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], + &min_key, inode); + if (ret < 0) { + err = ret; + goto out_unlock; + } else if (ret > 0) { + err = 1; + btrfs_set_log_full_commit(root->fs_info, trans); + goto out_unlock; + } + } + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (ins_nr == 0) diff --git a/fs/coredump.c b/fs/coredump.c index 1777331eee76..dfc87c5f5a54 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,9 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/path.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -627,6 +630,8 @@ void do_coredump(const siginfo_t *siginfo) } } else { struct inode *inode; + int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) goto fail_unlock; @@ -665,10 +670,27 @@ void do_coredump(const siginfo_t *siginfo) * what matters is that at least one of the two processes * writes its coredump successfully, not which one. */ - cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | - O_LARGEFILE | O_EXCL, - 0600); + if (need_suid_safe) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. + */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + cprm.file = file_open_root(root.dentry, root.mnt, + cn.corename, open_flags, 0600); + path_put(&root); + } else { + cprm.file = filp_open(cn.corename, open_flags, 0600); + } if (IS_ERR(cprm.file)) goto fail_unlock; diff --git a/fs/dcache.c b/fs/dcache.c index 877bcbbd03ff..18effa378f97 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1666,7 +1666,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | - DCACHE_OP_SELECT_INODE)); + DCACHE_OP_SELECT_INODE | + DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; @@ -1684,6 +1685,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_select_inode) dentry->d_flags |= DCACHE_OP_SELECT_INODE; + if (op->d_real) + dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5cf6d8be48dd..786cb51cab56 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -850,6 +850,29 @@ do { \ #include "extents_status.h" /* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* * fourth extended file system inode data in memory */ struct ext4_inode_info { diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e032a0423e35..9bdbf98240a0 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); - down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); + down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } @@ -483,6 +483,13 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } + if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be quota files [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EBUSY; + } + /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6e5a..ba1cf0bf2f81 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1292,9 +1292,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) return -1; } if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " - "when QUOTA feature is enabled"); - return -1; + ext4_msg(sb, KERN_INFO, "Journaled quota options " + "ignored when QUOTA feature is enabled"); + return 1; } qname = match_strdup(args); if (!qname) { @@ -1657,10 +1657,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_ERR, - "Cannot set journaled quota options " + ext4_msg(sb, KERN_INFO, + "Quota format mount options ignored " "when QUOTA feature is enabled"); - return -1; + return 1; } sbi->s_jquota_fmt = m->mount_opt; #endif @@ -1721,11 +1721,11 @@ static int parse_options(char *options, struct super_block *sb, #ifdef CONFIG_QUOTA if (ext4_has_feature_quota(sb) && (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " - "feature is enabled"); - return 0; - } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " + "mount options ignored."); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); + } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -4936,6 +4936,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) EXT4_SB(sb)->s_jquota_fmt, type); } +static void lockdep_set_quota_inode(struct inode *inode, int subclass) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + /* The first argument of lockdep_set_subclass has to be + * *exactly* the same as the argument to init_rwsem() --- in + * this case, in init_once() --- or lockdep gets unhappy + * because the name of the lock is set using the + * stringification of the argument to init_rwsem(). + */ + (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ + lockdep_set_subclass(&ei->i_data_sem, subclass); +} + /* * Standard function to be called on quota_on */ @@ -4975,8 +4989,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (err) return err; } - - return dquot_quota_on(sb, type, format_id, path); + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); + err = dquot_quota_on(sb, type, format_id, path); + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); + return err; } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, @@ -5002,8 +5020,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); + if (err) + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); return err; } diff --git a/fs/fhandle.c b/fs/fhandle.c index d59712dfa3e7..ca3c3dd01789 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag); + file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bf8c78920bee..de11206dda63 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode) wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); - wb_put(wb); /* not gonna deref it anymore */ /* i_wb may have changed inbetween, can't use inode_to_wb() */ - if (likely(wb == inode->i_wb)) - return wb; /* @inode already has ref */ + if (likely(wb == inode->i_wb)) { + wb_put(wb); /* @inode already has ref */ + return wb; + } spin_unlock(&wb->list_lock); + wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } @@ -1339,10 +1341,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() * and does more profound writeback list handling in writeback_sb_inodes(). */ -static int -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_single_inode(struct inode *inode, + struct writeback_control *wbc) { + struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); @@ -1380,7 +1382,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); - spin_lock(&wb->list_lock); + + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If inode is clean, remove it from writeback lists. Otherwise don't @@ -1455,6 +1458,7 @@ static long writeback_sb_inodes(struct super_block *sb, while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); + struct bdi_writeback *tmp_wb; if (inode->i_sb != sb) { if (work->sb) { @@ -1545,15 +1549,23 @@ static long writeback_sb_inodes(struct super_block *sb, cond_resched(); } - - spin_lock(&wb->list_lock); + /* + * Requeue @inode if still dirty. Be careful as @inode may + * have been switched to another wb in the meantime. + */ + tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) wrote++; - requeue_inode(inode, wb, &wbc); + requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); + if (unlikely(tmp_wb != wb)) { + spin_unlock(&tmp_wb->list_lock); + spin_lock(&wb->list_lock); + } + /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -2340,7 +2352,6 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, @@ -2352,7 +2363,7 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - return writeback_single_inode(inode, wb, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); @@ -2369,7 +2380,7 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { - return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); + return writeback_single_inode(inode, wbc); } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 8e3ee1936c7e..c5b6b7165489 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt) static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) { - struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); loff_t pos = 0; return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); @@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) { - struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); loff_t pos = 0; /* * No locking or generic_write_checks(), the server is diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8ff09059a91d..461dcf5e4526 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -546,6 +546,11 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } +static void fuse_io_release(struct kref *kref) +{ + kfree(container_of(kref, struct fuse_io_priv, refcnt)); +} + static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) { if (io->err) @@ -603,8 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) } io->iocb->ki_complete(io->iocb, res, 0); - kfree(io); } + + kref_put(&io->refcnt, fuse_io_release); } static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) @@ -631,6 +637,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, size_t num_bytes, struct fuse_io_priv *io) { spin_lock(&io->lock); + kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); @@ -709,7 +716,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, static int fuse_do_readpage(struct file *file, struct page *page) { - struct fuse_io_priv io = { .async = 0, .file = file }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -1009,7 +1016,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, size_t res; unsigned offset; unsigned i; - struct fuse_io_priv io = { .async = 0, .file = file }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); @@ -1430,7 +1437,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp); return __fuse_direct_read(&io, to, &iocb->ki_pos); } @@ -1438,7 +1445,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct fuse_io_priv io = { .async = 0, .file = file }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); ssize_t res; if (is_bad_inode(inode)) @@ -2824,6 +2831,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) loff_t i_size; size_t count = iov_iter_count(iter); struct fuse_io_priv *io; + bool is_sync = is_sync_kiocb(iocb); pos = offset; inode = file->f_mapping->host; @@ -2844,6 +2852,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) if (!io) return -ENOMEM; spin_lock_init(&io->lock); + kref_init(&io->refcnt); io->reqs = 1; io->bytes = -1; io->size = 0; @@ -2863,12 +2872,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size) && + if (!is_sync && (offset + count > i_size) && iov_iter_rw(iter) == WRITE) io->async = false; - if (io->async && is_sync_kiocb(iocb)) + if (io->async && is_sync) { + /* + * Additional reference to keep io around after + * calling fuse_aio_complete() + */ + kref_get(&io->refcnt); io->done = &wait; + } if (iov_iter_rw(iter) == WRITE) { ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); @@ -2881,14 +2896,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (!is_sync_kiocb(iocb)) + if (!is_sync) return -EIOCBQUEUED; wait_for_completion(&wait); ret = fuse_get_res_by_io(io); } - kfree(io); + kref_put(&io->refcnt, fuse_io_release); if (iov_iter_rw(iter) == WRITE) { if (ret > 0) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8db84062741d..0cbeea6ee831 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -22,6 +22,7 @@ #include <linux/rbtree.h> #include <linux/poll.h> #include <linux/workqueue.h> +#include <linux/kref.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -248,6 +249,7 @@ struct fuse_args { /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { + struct kref refcnt; int async; spinlock_t lock; unsigned reqs; @@ -261,6 +263,13 @@ struct fuse_io_priv { struct completion *done; }; +#define FUSE_IO_PRIV_SYNC(f) \ +{ \ + .refcnt = { ATOMIC_INIT(1) }, \ + .async = 0, \ + .file = f, \ +} + /** * Request flags * diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 81e622681c82..624a57a9c4aa 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1408,11 +1408,12 @@ out: /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. + * @write_op: With which operation should we write the journal sb * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ -static void jbd2_mark_journal_empty(journal_t *journal) +static void jbd2_mark_journal_empty(journal_t *journal, int write_op) { journal_superblock_t *sb = journal->j_superblock; @@ -1430,7 +1431,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, write_op); /* Log is no longer empty */ write_lock(&journal->j_state_lock); @@ -1716,7 +1717,13 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + + write_lock(&journal->j_state_lock); + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + write_unlock(&journal->j_state_lock); + + jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1975,7 +1982,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2021,7 +2028,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce5a21861074..5fc2162afb67 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -377,7 +377,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -560,7 +560,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; if (desc->plus != 0) - nfs_prime_dcache(desc->file->f_path.dentry, entry); + nfs_prime_dcache(file_dentry(desc->file), entry); status = nfs_readdir_add_to_array(entry, page); if (status != 0) @@ -864,7 +864,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir) */ static int nfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); nfs_readdir_descriptor_t my_desc, *desc = &my_desc; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3e2071a177fd..f714b98cfd74 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -927,7 +927,7 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index db9b5fea5b3e..679e003818b1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -26,7 +26,7 @@ static int nfs4_file_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file_dentry(filp); struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; @@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a9f096c7e99f..7d5351cd67fb 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -877,6 +877,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return err; + fh_unlock(&cstate->current_fh); if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51c9e9ca39a4..12935209deca 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename READ_BUF(4); rename->rn_snamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_snamelen + 4); + READ_BUF(rename->rn_snamelen); SAVEMEM(rename->rn_sname, rename->rn_snamelen); + READ_BUF(4); rename->rn_tnamelen = be32_to_cpup(p++); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); @@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient READ_BUF(8); setclientid->se_callback_prog = be32_to_cpup(p++); setclientid->se_callback_netid_len = be32_to_cpup(p++); - - READ_BUF(setclientid->se_callback_netid_len + 4); + READ_BUF(setclientid->se_callback_netid_len); SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + READ_BUF(4); setclientid->se_callback_addr_len = be32_to_cpup(p++); - READ_BUF(setclientid->se_callback_addr_len + 4); + READ_BUF(setclientid->se_callback_addr_len); SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + READ_BUF(4); setclientid->se_callback_ident = be32_to_cpup(p++); DECODE_TAIL; @@ -1815,8 +1817,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) READ_BUF(4); argp->taglen = be32_to_cpup(p++); - READ_BUF(argp->taglen + 8); + READ_BUF(argp->taglen); SAVEMEM(argp->tag, argp->taglen); + READ_BUF(8); argp->minorversion = be32_to_cpup(p++); argp->opcnt = be32_to_cpup(p++); max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index e36d63ff1783..f90931335c6b 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -262,6 +262,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; + u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -287,6 +288,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, status = DLM_DENIED; goto bail; } + + if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { + mlog(0, "last convert request returned DLM_RECOVERING, but " + "owner has already queued and sent ast to me. res %.*s, " + "(cookie=%u:%llu, type=%d, conv=%d)\n", + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.convert_type); + status = DLM_NORMAL; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ @@ -316,11 +330,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->convert_pending = 0; - /* if it failed, move it back to granted queue */ + /* if it failed, move it back to granted queue. + * if master returns DLM_NORMAL and then down before sending ast, + * it may have already been moved to granted queue, reset to + * DLM_RECOVERING and retry convert */ if (status != DLM_NORMAL) { if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); + } else if ((res->state & DLM_LOCK_RES_RECOVERING) || + (old_owner != res->owner)) { + mlog(0, "res %.*s is in recovering or has been recovered.\n", + res->lockname.len, res->lockname.name); + status = DLM_RECOVERING; } bail: spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 42f0cae93a0a..4a338803e7e9 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2064,7 +2064,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, dlm_lock_get(lock); if (lock->convert_pending) { /* move converting lock back to granted */ - BUG_ON(i != DLM_CONVERTING_LIST); mlog(0, "node died with convert pending " "on %.*s. move back to granted list.\n", res->lockname.len, res->lockname.name); diff --git a/fs/open.c b/fs/open.c index b6f1e96a7c0b..6a24f988d253 100644 --- a/fs/open.c +++ b/fs/open.c @@ -995,14 +995,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) EXPORT_SYMBOL(filp_open); struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *filename, int flags) + const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, mode, &op); if (err) return ERR_PTR(err); - if (flags & O_CREAT) - return ERR_PTR(-EINVAL); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 000b2ed05c29..a1acc6004a91 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -276,6 +276,37 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) +{ + struct dentry *real; + + if (d_is_dir(dentry)) { + if (!inode || inode == d_inode(dentry)) + return dentry; + goto bug; + } + + real = ovl_dentry_upper(dentry); + if (real && (!inode || inode == d_inode(real))) + return real; + + real = ovl_dentry_lower(dentry); + if (!real) + goto bug; + + if (!inode || inode == d_inode(real)) + return real; + + /* Handle recursion */ + if (real->d_flags & DCACHE_OP_REAL) + return real->d_op->d_real(real, inode); + +bug: + WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry, + inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); + return dentry; +} + static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = dentry->d_fsdata; @@ -320,11 +351,13 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, .d_select_inode = ovl_d_select_inode, + .d_real = ovl_d_real, }; static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, .d_select_inode = ovl_d_select_inode, + .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8ebd9a334085..87645955990d 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -197,6 +197,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) if (sb->s_op->show_devname) { seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; } else { if (r->mnt_devname) { seq_puts(m, "device "); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ef0d64b2a6d9..353ff31dcee1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1398,7 +1398,7 @@ static int dquot_active(const struct inode *inode) static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; - struct dquot **dquots, *got[MAXQUOTAS]; + struct dquot **dquots, *got[MAXQUOTAS] = {}; struct super_block *sb = inode->i_sb; qsize_t rsv; int ret = 0; @@ -1415,7 +1415,6 @@ static int __dquot_initialize(struct inode *inode, int type) int rc; struct dquot *dquot; - got[cnt] = NULL; if (type != -1 && cnt != type) continue; /* diff --git a/fs/splice.c b/fs/splice.c index 4cf700d50b40..0f77e9682857 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; + if (!spd_pages) + return 0; + ret = 0; do_wakeup = 0; page_nr = 0; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 0ef7c2ed3f8a..4fa14820e2e2 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->namelen, sbp->valuelen, &sbp->name[sbp->namelen]); - if (error) + if (error) { + kmem_free(sbuf); return error; + } if (context->seen_enough) break; cursor->offset++; @@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int( args.rmtblkcnt = xfs_attr3_rmt_blocks( args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); - if (retval) - return retval; - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); + if (!retval) + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); kmem_free(args.value); } else { retval = context->put_listent(context, |
