summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/affs/super.c5
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/backref.c10
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/disk-io.c34
-rw-r--r--fs/btrfs/extent-tree.c13
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/inode-map.c9
-rw-r--r--fs/btrfs/inode-map.h1
-rw-r--r--fs/btrfs/inode.c47
-rw-r--r--fs/btrfs/ioctl.c125
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/btrfs/send.c16
-rw-r--r--fs/btrfs/super.c24
-rw-r--r--fs/btrfs/tree-log.c137
-rw-r--r--fs/btrfs/volumes.c29
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifs_debug.h9
-rw-r--r--fs/cifs/cifsencrypt.c2
-rw-r--r--fs/cifs/cifsfs.h12
-rw-r--r--fs/cifs/cifssmb.c21
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/readdir.c1
-rw-r--r--fs/cifs/sess.c139
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c8
-rw-r--r--fs/cifs/smb2pdu.c40
-rw-r--r--fs/cifs/smb2proto.h2
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/coredump.c30
-rw-r--r--fs/dcache.c25
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/devpts/inode.c20
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/efivarfs/file.c70
-rw-r--r--fs/efivarfs/inode.c30
-rw-r--r--fs/efivarfs/internal.h3
-rw-r--r--fs/efivarfs/super.c16
-rw-r--r--fs/ext4/balloc.c7
-rw-r--r--fs/ext4/crypto_key.c4
-rw-r--r--fs/ext4/ext4.h36
-rw-r--r--fs/ext4/extents.c133
-rw-r--r--fs/ext4/file.c66
-rw-r--r--fs/ext4/ialloc.c65
-rw-r--r--fs/ext4/inode.c109
-rw-r--r--fs/ext4/mballoc.c10
-rw-r--r--fs/ext4/move_extent.c27
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c48
-rw-r--r--fs/ext4/truncate.h2
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/fs-writeback.c76
-rw-r--r--fs/fuse/cuse.c4
-rw-r--r--fs/fuse/file.c33
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/hostfs/hostfs_kern.c4
-rw-r--r--fs/hpfs/namei.c31
-rw-r--r--fs/hpfs/super.c42
-rw-r--r--fs/hugetlbfs/inode.c19
-rw-r--r--fs/isofs/rock.c13
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jffs2/README.Locking5
-rw-r--r--fs/jffs2/build.c75
-rw-r--r--fs/jffs2/dir.c11
-rw-r--r--fs/jffs2/file.c39
-rw-r--r--fs/jffs2/gc.c17
-rw-r--r--fs/jffs2/nodelist.h6
-rw-r--r--fs/locks.c51
-rw-r--r--fs/namei.c48
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/nfs/dir.c6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c8
-rw-r--r--fs/nfs/inode.c56
-rw-r--r--fs/nfs/nfs4file.c4
-rw-r--r--fs/nfs/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4proc.c1
-rw-r--r--fs/nfsd/nfs4xdr.c13
-rw-r--r--fs/ocfs2/acl.c87
-rw-r--r--fs/ocfs2/acl.h5
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c24
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c26
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c3
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/namei.c23
-rw-r--r--fs/ocfs2/refcounttree.c17
-rw-r--r--fs/ocfs2/xattr.c14
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/open.c18
-rw-r--r--fs/overlayfs/copy_up.c41
-rw-r--r--fs/overlayfs/dir.c10
-rw-r--r--fs/overlayfs/inode.c15
-rw-r--r--fs/overlayfs/readdir.c3
-rw-r--r--fs/overlayfs/super.c51
-rw-r--r--fs/pipe.c47
-rw-r--r--fs/pnode.c32
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c24
-rw-r--r--fs/proc/namespaces.c4
-rw-r--r--fs/proc/task_mmu.c40
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/quota/dquot.c3
-rw-r--r--fs/splice.c3
-rw-r--r--fs/super.c1
-rw-r--r--fs/timerfd.c2
-rw-r--r--fs/udf/inode.c15
-rw-r--r--fs/udf/unicode.c21
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c1
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c1
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c1
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c37
-rw-r--r--fs/xfs/libxfs/xfs_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c16
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_shared.h1
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c1
-rw-r--r--fs/xfs/xfs_attr_list.c19
-rw-r--r--fs/xfs/xfs_buf.c17
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_fsops.c4
-rw-r--r--fs/xfs/xfs_inode.c26
-rw-r--r--fs/xfs/xfs_log_recover.c9
-rw-r--r--fs/xfs/xfs_super.c10
-rw-r--r--fs/xfs/xfs_trans_ail.c1
140 files changed, 2006 insertions, 739 deletions
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5b50c4ca43a7..f90c535703ce 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
char *prefix = NULL;
new_opts = kstrdup(data, GFP_KERNEL);
- if (!new_opts)
+ if (data && !new_opts)
return -ENOMEM;
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
@@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
}
flush_delayed_work(&sbi->sb_work);
- replace_mount_options(sb, new_opts);
+ if (new_opts)
+ replace_mount_options(sb, new_opts);
sbi->s_flags = mount_flags;
sbi->s_mode = mode;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 3e36e4adc4a3..9aba42b78253 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
list_add_tail(&work->ordered_list, &wq->ordered_list);
spin_unlock_irqrestore(&wq->list_lock, flags);
}
- queue_work(wq->normal_wq, &work->normal_work);
trace_btrfs_work_queued(work);
+ queue_work(wq->normal_wq, &work->normal_work);
}
void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d453d62ab0c6..e2f659dc5745 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1417,7 +1417,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
read_extent_buffer(eb, dest + bytes_left,
name_off, name_len);
if (eb != eb_in) {
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1437,9 +1438,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- atomic_inc(&eb->refs);
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ if (!path->skip_locking)
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->nodes[0] = NULL;
+ path->locks[0] = 0;
}
btrfs_release_path(path);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 35489e7129a7..385b449fd7ed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1572,7 +1572,7 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct rw_semaphore delayed_iput_sem;
+ struct mutex cleaner_delayed_iput_mutex;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e0941fbb913c..02b934d0ee65 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1694,7 +1694,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
*
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ struct list_head *ins_list, bool *emitted)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1738,6 +1738,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
if (over)
return 1;
+ *emitted = true;
}
return 0;
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f70119f25421..0167853c84ae 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ struct list_head *ins_list, bool *emitted);
/* for init */
int __init btrfs_delayed_inode_init(void);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 974be09e7556..41fb43183406 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1582,8 +1582,23 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto free_writers;
+
+ mutex_lock(&root->objectid_mutex);
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
+ if (ret) {
+ mutex_unlock(&root->objectid_mutex);
+ goto free_root_dev;
+ }
+
+ ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&root->objectid_mutex);
+
return 0;
+free_root_dev:
+ free_anon_bdev(root->anon_dev);
free_writers:
btrfs_free_subvolume_writers(root->subv_writers);
fail:
@@ -1762,7 +1777,6 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
- set_freezable();
do {
again = 0;
@@ -1782,7 +1796,10 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -2542,8 +2559,8 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
- init_rwsem(&fs_info->delayed_iput_sem);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
@@ -2668,6 +2685,7 @@ int open_ctree(struct super_block *sb,
if (btrfs_check_super_csum(bh->b_data)) {
printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
+ brelse(bh);
goto fail_alloc;
}
@@ -2900,6 +2918,18 @@ retry_root_backup:
tree_root->commit_root = btrfs_root_node(tree_root);
btrfs_set_root_refs(&tree_root->root_item, 1);
+ mutex_lock(&tree_root->objectid_mutex);
+ ret = btrfs_find_highest_objectid(tree_root,
+ &tree_root->highest_objectid);
+ if (ret) {
+ mutex_unlock(&tree_root->objectid_mutex);
+ goto recovery_tree_root;
+ }
+
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&tree_root->objectid_mutex);
+
ret = btrfs_read_roots(fs_info, tree_root);
if (ret)
goto recovery_tree_root;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c4661db2b72a..2368cac1115a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4086,8 +4086,10 @@ commit_trans:
!atomic_read(&root->fs_info->open_ioctl_trans)) {
need_commit--;
- if (need_commit > 0)
+ if (need_commit > 0) {
+ btrfs_start_delalloc_roots(fs_info, 0, -1);
btrfs_wait_ordered_roots(fs_info, -1);
+ }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -4100,11 +4102,12 @@ commit_trans:
if (ret)
return ret;
/*
- * make sure that all running delayed iput are
- * done
+ * The cleaner kthread might still be doing iput
+ * operations. Wait for it to finish so that
+ * more space is released.
*/
- down_write(&root->fs_info->delayed_iput_sem);
- up_write(&root->fs_info->delayed_iput_sem);
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
goto again;
} else {
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0f09526aa7d9..5e5db3687e34 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1885,7 +1885,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 767a6056ac45..07573dc1614a 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -515,7 +515,7 @@ out:
return ret;
}
-static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
@@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
int ret;
mutex_lock(&root->objectid_mutex);
- if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
- if (ret)
- goto out;
- }
-
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
ret = -ENOSPC;
goto out;
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
index ddb347bfee23..c8e864b2d530 100644
--- a/fs/btrfs/inode-map.h
+++ b/fs/btrfs/inode-map.h
@@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans);
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a70c5790f8f5..4bc9dbf29a73 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3142,8 +3142,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
if (empty)
return;
- down_read(&fs_info->delayed_iput_sem);
-
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -3154,8 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
iput(delayed->inode);
kfree(delayed);
}
-
- up_read(&root->fs_info->delayed_iput_sem);
}
/*
@@ -5741,6 +5737,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
char *name_ptr;
int name_len;
int is_curr = 0; /* ctx->pos points to the current index? */
+ bool emitted;
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
@@ -5769,6 +5766,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (ret < 0)
goto err;
+ emitted = false;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
@@ -5848,6 +5846,7 @@ skip:
if (over)
goto nopos;
+ emitted = true;
di_len = btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di) + sizeof(*di);
di_cur += di_len;
@@ -5860,11 +5859,20 @@ next:
if (key_type == BTRFS_DIR_INDEX_KEY) {
if (is_curr)
ctx->pos++;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
if (ret)
goto nopos;
}
+ /*
+ * If we haven't emitted any dir entry, we must not touch ctx->pos as
+ * it was was set to the termination value in previous call. We assume
+ * that "." and ".." were emitted if we reach this point and set the
+ * termination value as well for an empty directory.
+ */
+ if (ctx->pos > 2 && !emitted)
+ goto nopos;
+
/* Reached end of directory/root. Bump pos past the last item. */
ctx->pos++;
@@ -6481,7 +6489,7 @@ out_unlock_inode:
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
u64 index;
@@ -6507,6 +6515,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
goto fail;
}
@@ -6540,9 +6549,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
fail:
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -7985,6 +7995,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
if (io_bio->end_io)
@@ -8030,6 +8041,7 @@ out_test:
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
bio_put(bio);
}
@@ -8534,15 +8546,28 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
+ struct inode *inode = page->mapping->host;
+ int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
+ /*
+ * If we are under memory pressure we will call this directly from the
+ * VM, we need to make sure we have the inode referenced for the ordered
+ * extent. If not just return like we didn't do anything.
+ */
+ if (!igrab(inode)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ btrfs_add_delayed_iput(inode);
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -9636,9 +9661,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
/*
* 2 items for inode item and ref
* 2 items for dir items
+ * 1 item for updating parent inode item
+ * 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index da94138eb85e..bfcd87ee8ff5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
+ mutex_lock(&new_root->objectid_mutex);
+ new_root->highest_objectid = new_dirid;
+ mutex_unlock(&new_root->objectid_mutex);
+
/*
* insert the directory item
*/
@@ -1644,7 +1648,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) {
- btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+ btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
@@ -2782,24 +2786,29 @@ out:
static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
page = grab_cache_page(inode->i_mapping, index);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
if (!PageUptodate(page)) {
- if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
- 0))
- return NULL;
+ int ret;
+
+ ret = btrfs_readpage(NULL, page);
+ if (ret)
+ return ERR_PTR(ret);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
- return NULL;
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ return ERR_PTR(-EAGAIN);
}
}
- unlock_page(page);
return page;
}
@@ -2811,17 +2820,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
pgoff_t index = off >> PAGE_CACHE_SHIFT;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = extent_same_get_page(inode, index + i);
- if (!pages[i])
- return -ENOMEM;
+ if (IS_ERR(pages[i])) {
+ int err = PTR_ERR(pages[i]);
+
+ if (err == -EAGAIN)
+ goto again;
+ pages[i] = NULL;
+ return err;
+ }
}
return 0;
}
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+ bool retry_range_locking)
{
- /* do any pending delalloc/csum calc on src, one way or
- another, and lock file content */
+ /*
+ * Do any pending delalloc/csum calculations on inode, one way or
+ * another, and lock file content.
+ * The locking order is:
+ *
+ * 1) pages
+ * 2) range in the inode's io tree
+ */
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2839,8 +2862,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
+ if (!retry_range_locking)
+ return -EAGAIN;
btrfs_wait_ordered_range(inode, off, len);
}
+ return 0;
}
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2865,15 +2891,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len,
+ bool retry_range_locking)
{
+ int ret;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
- lock_extent_range(inode1, loff1, len);
- lock_extent_range(inode2, loff2, len);
+ ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+ if (ret)
+ return ret;
+ ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+ if (ret)
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+ loff1 + len - 1);
+ return ret;
}
struct cmp_pages {
@@ -2889,11 +2924,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
for (i = 0; i < cmp->num_pages; i++) {
pg = cmp->src_pages[i];
- if (pg)
+ if (pg) {
+ unlock_page(pg);
page_cache_release(pg);
+ }
pg = cmp->dst_pages[i];
- if (pg)
+ if (pg) {
+ unlock_page(pg);
page_cache_release(pg);
+ }
}
kfree(cmp->src_pages);
kfree(cmp->dst_pages);
@@ -2954,6 +2993,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
src_page = cmp->src_pages[i];
dst_page = cmp->dst_pages[i];
+ ASSERT(PageLocked(src_page));
+ ASSERT(PageLocked(dst_page));
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -3066,14 +3107,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
goto out_unlock;
}
+again:
ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
if (ret)
goto out_unlock;
if (same_inode)
- lock_extent_range(src, same_lock_start, same_lock_len);
+ ret = lock_extent_range(src, same_lock_start, same_lock_len,
+ false);
else
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+ false);
+ /*
+ * If one of the inodes has dirty pages in the respective range or
+ * ordered extents, we need to flush dellaloc and wait for all ordered
+ * extents in the range. We must unlock the pages and the ranges in the
+ * io trees to avoid deadlocks when flushing delalloc (requires locking
+ * pages) and when waiting for ordered extents to complete (they require
+ * range locking).
+ */
+ if (ret == -EAGAIN) {
+ /*
+ * Ranges in the io trees already unlocked. Now unlock all
+ * pages before waiting for all IO to complete.
+ */
+ btrfs_cmp_data_free(&cmp);
+ if (same_inode) {
+ btrfs_wait_ordered_range(src, same_lock_start,
+ same_lock_len);
+ } else {
+ btrfs_wait_ordered_range(src, loff, len);
+ btrfs_wait_ordered_range(dst, dst_loff, len);
+ }
+ goto again;
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ btrfs_cmp_data_free(&cmp);
+ return ret;
+ }
/* pass original length for comparison so we stay within i_size */
ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3895,9 +3968,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 lock_start = min_t(u64, off, destoff);
u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, lock_start, lock_len);
+ ret = lock_extent_range(src, lock_start, lock_len, true);
} else {
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+ true);
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ goto out_unlock;
}
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7cf8509deda7..2c849b08a91b 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ if (err == -EEXIST)
+ err = 0;
if (err) {
- BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 355a458cba1a..63a6152be04b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1469,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret);
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between or the subvol was snapshoted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 24154e422945..fe609b81dd1b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1956,6 +1956,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* there are other factors that may change the result (like a new metadata
* chunk).
*
+ * If metadata is exhausted, f_bavail will be 0.
+ *
* FIXME: not accurate for mixed block groups, total and free/used are ok,
* available appears slightly larger.
*/
@@ -1967,11 +1969,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
+ u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
+ u64 thresh = 0;
/*
* holding chunk_muext to avoid allocating new chunks, holding
@@ -1997,6 +2001,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ total_free_meta += found->disk_total - found->disk_used;
total_used += found->disk_used;
}
@@ -2019,6 +2025,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
+ /*
+ * We calculate the remaining metadata space minus global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this
+ * does not hold in practice, the exhausted state happens where's still
+ * some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various
+ * operations, so some operations that consume a few metadata will
+ * succeed even if the Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = 4 * 1024 * 1024;
+
+ if (total_free_meta - thresh < block_rsv->size)
+ buf->f_bavail = 0;
+
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..0e044d7ee721 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4406,6 +4406,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ const int slot,
+ const struct btrfs_key *key,
+ struct inode *inode)
+{
+ int ret;
+ struct btrfs_path *search_path;
+ char *name = NULL;
+ u32 name_len = 0;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+ search_path = btrfs_alloc_path();
+ if (!search_path)
+ return -ENOMEM;
+ search_path->search_commit_root = 1;
+ search_path->skip_locking = 1;
+
+ while (cur_offset < item_size) {
+ u64 parent;
+ u32 this_name_len;
+ u32 this_len;
+ unsigned long name_ptr;
+ struct btrfs_dir_item *di;
+
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+
+ iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ parent = key->offset;
+ this_name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_ptr = (unsigned long)(iref + 1);
+ this_len = sizeof(*iref) + this_name_len;
+ } else {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ this_name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_ptr = (unsigned long)&extref->name;
+ this_len = sizeof(*extref) + this_name_len;
+ }
+
+ if (this_name_len > name_len) {
+ char *new_name;
+
+ new_name = krealloc(name, this_name_len, GFP_NOFS);
+ if (!new_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ name_len = this_name_len;
+ name = new_name;
+ }
+
+ read_extent_buffer(eb, name, name_ptr, this_name_len);
+ di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+ search_path, parent,
+ name, this_name_len, 0);
+ if (di && !IS_ERR(di)) {
+ ret = 1;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_release_path(search_path);
+
+ cur_offset += this_len;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(search_path);
+ kfree(name);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4578,6 +4699,22 @@ again:
if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false;
+ if ((min_key.type == BTRFS_INODE_REF_KEY ||
+ min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+ BTRFS_I(inode)->generation == trans->transid) {
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0],
+ &min_key, inode);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ } else if (ret > 0) {
+ err = 1;
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ goto out_unlock;
+ }
+ }
+
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a23399e8e3ab..9c62a6f9757a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -232,6 +232,7 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
@@ -1257,6 +1258,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
int ret;
int slot;
struct extent_buffer *l;
+ u64 min_search_start;
+
+ /*
+ * We don't want to overwrite the superblock on the drive nor any area
+ * used by the boot loader (grub for example), so we make sure to start
+ * at an offset of at least 1MB.
+ */
+ min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ search_start = max(search_start, min_search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1397,18 +1407,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
- struct btrfs_root *root = device->dev_root;
- u64 search_start;
-
/* FIXME use last free of some kind */
-
- /*
- * we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
- */
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
return find_free_dev_extent_start(trans->transaction, device,
- num_bytes, search_start, start, len);
+ num_bytes, 0, start, len);
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -6512,6 +6513,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ printk(KERN_ERR
+ "BTRFS: invalid number of stripes %u in sys_array at offset %u\n",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7febcf2475c5..50b268483302 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("CIFS VFS: %pV", &vaf);
+ pr_err_ratelimited("CIFS VFS: %pV", &vaf);
va_end(args);
}
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index f40fbaca1b2a..66cf0f9fff89 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
/* information message: e.g., configuration, major event */
#define cifs_dbg(type, fmt, ...) \
do { \
- if (type == FYI) { \
- if (cifsFYI & CIFS_INFO) { \
- pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \
- } \
+ if (type == FYI && cifsFYI & CIFS_INFO) { \
+ pr_debug_ratelimited("%s: " \
+ fmt, __FILE__, ##__VA_ARGS__); \
} else if (type == VFS) { \
cifs_vfs_err(fmt, ##__VA_ARGS__); \
} else if (type == NOISY && type != 0) { \
- pr_debug(fmt, ##__VA_ARGS__); \
+ pr_debug_ratelimited(fmt, ##__VA_ARGS__); \
} \
} while (0)
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index afa09fce8151..e682b36a210f 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -714,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
- rc = ENOMEM;
+ rc = -ENOMEM;
ses->auth_key.len = 0;
goto setup_ntlmv2_rsp_ret;
}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c3cc1609025f..44b3d4280abb 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -31,19 +31,15 @@
* so that it will fit. We use hash_64 to convert the value to 31 bits, and
* then add 1, to ensure that we don't end up with a 0 as the value.
*/
-#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
+ if ((sizeof(ino_t)) < (sizeof(u64)))
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+
return (ino_t)fileid;
+
}
-#else
-static inline ino_t
-cifs_uniqueid_to_ino_t(u64 fileid)
-{
- return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
-}
-#endif
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 90b4f9f7de66..76fcb50295a3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1396,11 +1396,10 @@ openRetry:
* current bigbuf.
*/
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+discard_remaining_data(struct TCP_Server_Info *server)
{
unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
- struct cifs_readdata *rdata = mid->callback_data;
while (remaining > 0) {
int length;
@@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
remaining -= length;
}
- dequeue_mid(mid, rdata->result);
return 0;
}
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ length = discard_remaining_data(server);
+ dequeue_mid(mid, rdata->result);
+ return length;
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server, 0)) {
+ discard_remaining_data(server);
+ return -1;
+ }
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ecb0803bdb0e..3c194ff0d2f0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -368,7 +368,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->session_key.response = NULL;
server->session_key.len = 0;
server->lstrp = jiffies;
- mutex_unlock(&server->srv_mutex);
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(&retry_list);
@@ -381,6 +380,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
list_move(&mid_entry->qhead, &retry_list);
}
spin_unlock(&GlobalMid_Lock);
+ mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_safe(tmp, tmp2, &retry_list) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 0557c45e9c33..b30a4a6d98a0 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
* if buggy server returns . and .. late do we want to
* check for that here?
*/
+ *tmp_buf = 0;
rc = cifs_filldir(current_entry, file, ctx,
tmp_buf, max_len);
if (rc) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 59727e32ed0f..af0ec2d5ad0e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -400,19 +400,27 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
sec_blob->LmChallengeResponse.MaximumLength = 0;
sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
- rc = setup_ntlmv2_rsp(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
- goto setup_ntlmv2_ret;
+ if (ses->user_name != NULL) {
+ rc = setup_ntlmv2_rsp(ses, nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
+ goto setup_ntlmv2_ret;
+ }
+ memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+ sec_blob->NtChallengeResponse.Length =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ sec_blob->NtChallengeResponse.MaximumLength =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ } else {
+ /*
+ * don't send an NT Response for anonymous access
+ */
+ sec_blob->NtChallengeResponse.Length = 0;
+ sec_blob->NtChallengeResponse.MaximumLength = 0;
}
- memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-
- sec_blob->NtChallengeResponse.Length =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- sec_blob->NtChallengeResponse.MaximumLength =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
if (ses->domainName == NULL) {
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -670,20 +678,24 @@ sess_auth_lanman(struct sess_data *sess_data)
pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* no capabilities flags in old lanman negotiation */
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-
- /* Calculate hash with password and copy into bcc_ptr.
- * Encryption Key (stored as in cryptkey) gets used if the
- * security mode bit in Negottiate Protocol response states
- * to use challenge/response method (i.e. Password bit is 1).
- */
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
- ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
- true : false, lnm_session_key);
-
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ if (ses->user_name != NULL) {
+ /* no capabilities flags in old lanman negotiation */
+ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+ /* Calculate hash with password and copy into bcc_ptr.
+ * Encryption Key (stored as in cryptkey) gets used if the
+ * security mode bit in Negottiate Protocol response states
+ * to use challenge/response method (i.e. Password bit is 1).
+ */
+ rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
+ ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
+ true : false, lnm_session_key);
+
+ memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ } else {
+ pSMB->old_req.PasswordLength = 0;
+ }
/*
* can not sign if LANMAN negotiated so no need
@@ -769,26 +781,31 @@ sess_auth_ntlm(struct sess_data *sess_data)
capabilities = cifs_ssetup_hdr(ses, pSMB);
pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
- pSMB->req_no_secext.CaseInsensitivePasswordLength =
- cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-
- /* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses, sess_data->nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLM authentication\n",
- rc);
- goto out;
- }
+ if (ses->user_name != NULL) {
+ pSMB->req_no_secext.CaseInsensitivePasswordLength =
+ cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
+ cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+ /* calculate ntlm response and session key */
+ rc = setup_ntlm_response(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+ rc);
+ goto out;
+ }
- /* copy ntlm response */
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ /* copy ntlm response */
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ } else {
+ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+ pSMB->req_no_secext.CaseSensitivePasswordLength = 0;
+ }
if (ses->capabilities & CAP_UNICODE) {
/* unicode strings must be word aligned */
@@ -878,22 +895,26 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
/* LM2 password would be here if we supported it */
pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
- /* calculate nlmv2 response and session key */
- rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
- goto out;
- }
+ if (ses->user_name != NULL) {
+ /* calculate nlmv2 response and session key */
+ rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
+ goto out;
+ }
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
- /* set case sensitive password length after tilen may get
- * assigned, tilen is 0 otherwise.
- */
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ /* set case sensitive password length after tilen may get
+ * assigned, tilen is 0 otherwise.
+ */
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ } else {
+ pSMB->req_no_secext.CaseSensitivePasswordLength = 0;
+ }
if (ses->capabilities & CAP_UNICODE) {
if (sess_data->iov[0].iov_len % 2) {
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index bc0bb9c34f72..0ffa18094335 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -44,6 +44,7 @@
#define SMB2_OP_DELETE 7
#define SMB2_OP_HARDLINK 8
#define SMB2_OP_SET_EOF 9
+#define SMB2_OP_RMDIR 10
/* Used when constructing chained read requests. */
#define CHAINED_REQUEST 1
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 899bbc86f73e..4f0231e685a9 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -80,6 +80,10 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
* SMB2_open() call.
*/
break;
+ case SMB2_OP_RMDIR:
+ tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid);
+ break;
case SMB2_OP_RENAME:
tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
fid.volatile_fid, (__le16 *)data);
@@ -191,8 +195,8 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
- NULL, SMB2_OP_DELETE);
+ CREATE_NOT_FILE,
+ NULL, SMB2_OP_RMDIR);
}
int
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 767555518d40..82c5f57382b2 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1109,21 +1109,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
{
char *data_offset;
struct create_context *cc;
- unsigned int next = 0;
+ unsigned int next;
+ unsigned int remaining;
char *name;
data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
- do {
- cc = (struct create_context *)((char *)cc + next);
+ while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) != 4 ||
- strncmp(name, "RqLs", 4)) {
- next = le32_to_cpu(cc->Next);
- continue;
- }
- return server->ops->parse_lease_buf(cc, epoch);
- } while (next != 0);
+ if (le16_to_cpu(cc->NameLength) == 4 &&
+ strncmp(name, "RqLs", 4) == 0)
+ return server->ops->parse_lease_buf(cc, epoch);
+
+ next = le32_to_cpu(cc->Next);
+ if (!next)
+ break;
+ remaining -= next;
+ cc = (struct create_context *)((char *)cc + next);
+ }
return 0;
}
@@ -2573,6 +2577,22 @@ SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
}
int
+SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ __u8 delete_pending = 1;
+ void *data;
+ unsigned int size;
+
+ data = &delete_pending;
+ size = 1; /* sizeof __u8 */
+
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, FILE_DISPOSITION_INFORMATION, 1, &data,
+ &size);
+}
+
+int
SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
{
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 79dc650c18b2..9bc59f9c12fb 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -140,6 +140,8 @@ extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le16 *target_file);
+extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid);
extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
__le16 *target_file);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 2a24c524fb9a..87abe8ed074c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_in_send_dec(server);
cifs_save_when_sent(mid);
- if (rc < 0)
+ if (rc < 0) {
server->sequence_number -= 2;
+ cifs_delete_mid(mid);
+ }
+
mutex_unlock(&server->srv_mutex);
if (rc == 0)
return 0;
- cifs_delete_mid(mid);
add_credits_and_wake_if(server, credits, optype);
return rc;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 1777331eee76..dfc87c5f5a54 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,9 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -627,6 +630,8 @@ void do_coredump(const siginfo_t *siginfo)
}
} else {
struct inode *inode;
+ int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL;
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
@@ -665,10 +670,27 @@ void do_coredump(const siginfo_t *siginfo)
* what matters is that at least one of the two processes
* writes its coredump successfully, not which one.
*/
- cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW |
- O_LARGEFILE | O_EXCL,
- 0600);
+ if (need_suid_safe) {
+ /*
+ * Using user namespaces, normal user tasks can change
+ * their current->fs->root to point to arbitrary
+ * directories. Since the intention of the "only dump
+ * with a fully qualified path" rule is to control where
+ * coredumps may be placed using root privileges,
+ * current->fs->root must not be used. Instead, use the
+ * root directory of init_task.
+ */
+ struct path root;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+ cprm.file = file_open_root(root.dentry, root.mnt,
+ cn.corename, open_flags, 0600);
+ path_put(&root);
+ } else {
+ cprm.file = filp_open(cn.corename, open_flags, 0600);
+ }
if (IS_ERR(cprm.file))
goto fail_unlock;
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c33aeb0f68f..18effa378f97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
unsigned flags;
dentry->d_inode = inode;
- smp_wmb();
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
WRITE_ONCE(dentry->d_flags, flags);
}
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
- smp_wmb();
dentry->d_inode = NULL;
}
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -1677,7 +1666,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
DCACHE_OP_DELETE |
- DCACHE_OP_SELECT_INODE));
+ DCACHE_OP_SELECT_INODE |
+ DCACHE_OP_REAL));
dentry->d_op = op;
if (!op)
return;
@@ -1695,6 +1685,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_PRUNE;
if (op->d_select_inode)
dentry->d_flags |= DCACHE_OP_SELECT_INODE;
+ if (op->d_real)
+ dentry->d_flags |= DCACHE_OP_REAL;
}
EXPORT_SYMBOL(d_set_d_op);
@@ -1757,8 +1749,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
spin_lock(&dentry->d_lock);
if (inode)
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b7fcc0de0b2f..0f5d05bf2131 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -457,7 +457,7 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
dentry->d_fsdata = (void *)f;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c35ffdc12bba..706de324f2a6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
mutex_unlock(&allocated_ptys_lock);
}
+/*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+
+void devpts_add_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ atomic_inc(&sb->s_active);
+ ihold(ptmx_inode);
+}
+
+void devpts_del_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ iput(ptmx_inode);
+ deactivate_super(sb);
+}
+
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 602e8441bc0f..01171d8a6ee9 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -472,8 +472,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
dio->io_error = -EIO;
if (dio->is_async && dio->rw == READ && dio->should_dirty) {
- bio_check_pages_dirty(bio); /* transfers ownership */
err = bio->bi_error;
+ bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 90001da9abfd..66842e55c48c 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -103,9 +104,78 @@ out_free:
return size;
}
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int i_flags;
+ unsigned int flags = 0;
+
+ i_flags = inode->i_flags;
+ if (i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+efivarfs_ioc_setxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int flags;
+ unsigned int i_flags = 0;
+ int error;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~FS_IMMUTABLE_FL)
+ return -EOPNOTSUPP;
+
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ if (flags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+
+
+ error = mnt_want_write_file(file);
+ if (error)
+ return error;
+
+ mutex_lock(&inode->i_mutex);
+ inode_set_flags(inode, i_flags, S_IMMUTABLE);
+ mutex_unlock(&inode->i_mutex);
+
+ mnt_drop_write_file(file);
+
+ return 0;
+}
+
+long
+efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ void __user *arg = (void __user *)p;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return efivarfs_ioc_getxflags(file, arg);
+ case FS_IOC_SETFLAGS:
+ return efivarfs_ioc_setxflags(file, arg);
+ }
+
+ return -ENOTTY;
+}
+
const struct file_operations efivarfs_file_operations = {
.open = simple_open,
.read = efivarfs_file_read,
.write = efivarfs_file_write,
.llseek = no_llseek,
+ .unlocked_ioctl = efivarfs_file_ioctl,
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 3381b9da9ee6..e2ab6d0497f2 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -15,7 +15,8 @@
#include "internal.h"
struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev)
+ const struct inode *dir, int mode,
+ dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
@@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_fop = &efivarfs_file_operations;
@@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- struct inode *inode;
+ struct inode *inode = NULL;
struct efivar_entry *var;
int namelen, i = 0, err = 0;
+ bool is_removable = false;
if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
return -EINVAL;
- inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
- if (!inode)
- return -ENOMEM;
-
var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
- if (!var) {
- err = -ENOMEM;
- goto out;
- }
+ if (!var)
+ return -ENOMEM;
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
@@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
&var->var.VendorGuid);
+ if (efivar_variable_is_removable(var->var.VendorGuid,
+ dentry->d_name.name, namelen))
+ is_removable = true;
+
+ inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
for (i = 0; i < namelen; i++)
var->var.VariableName[i] = dentry->d_name.name[i];
@@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
out:
if (err) {
kfree(var);
- iput(inode);
+ if (inode)
+ iput(inode);
}
return err;
}
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index b5ff16addb7c..b4505188e799 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations;
extern const struct inode_operations efivarfs_dir_inode_operations;
extern bool efivarfs_valid_name(const char *str, int len);
extern struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev);
+ const struct inode *dir, int mode, dev_t dev,
+ bool is_removable);
extern struct list_head efivarfs_list;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 86a2121828c3..abb244b06024 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
struct dentry *dentry, *root = sb->s_root;
unsigned long size = 0;
char *name;
- int len, i;
+ int len;
int err = -ENOMEM;
+ bool is_removable = false;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
@@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
memcpy(entry->var.VariableName, name16, name_size);
memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
- len = ucs2_strlen(entry->var.VariableName);
+ len = ucs2_utf8size(entry->var.VariableName);
/* name, plus '-', plus GUID, plus NUL*/
name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
if (!name)
goto fail;
- for (i = 0; i < len; i++)
- name[i] = entry->var.VariableName[i] & 0xFF;
+ ucs2_as_utf8(name, entry->var.VariableName, len);
+
+ if (efivar_variable_is_removable(entry->var.VendorGuid, name, len))
+ is_removable = true;
name[len] = '-';
@@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
- inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
+ inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
+ is_removable);
if (!inode)
goto fail_name;
@@ -200,7 +204,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
- inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+ inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
inode->i_op = &efivarfs_dir_inode_operations;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ec0668a60678..fe1f50fe764f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init block bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
goto verify;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index c5882b36e558..9a16d1e75a49 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -213,9 +213,11 @@ retry:
res = -ENOKEY;
goto out;
}
+ down_read(&keyring_key->sem);
ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
res = -EINVAL;
+ up_read(&keyring_key->sem);
goto out;
}
master_key = (struct ext4_encryption_key *)ukp->data;
@@ -226,10 +228,12 @@ retry:
"ext4: key size incorrect: %d\n",
master_key->size);
res = -ENOKEY;
+ up_read(&keyring_key->sem);
goto out;
}
res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
raw_key);
+ up_read(&keyring_key->sem);
if (res)
goto out;
got_key:
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cc7ca4e87144..b7e921d207fb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -850,6 +850,29 @@ do { \
#include "extents_status.h"
/*
+ * Lock subclasses for i_data_sem in the ext4_inode_info structure.
+ *
+ * These are needed to avoid lockdep false positives when we need to
+ * allocate blocks to the quota inode during ext4_map_blocks(), while
+ * holding i_data_sem for a normal (non-quota) inode. Since we don't
+ * do quota tracking for the quota inode, this avoids deadlock (as
+ * well as infinite recursion, since it isn't turtles all the way
+ * down...)
+ *
+ * I_DATA_SEM_NORMAL - Used for most inodes
+ * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode
+ * where the second inode has larger inode number
+ * than the first
+ * I_DATA_SEM_QUOTA - Used for quota inodes only
+ */
+enum {
+ I_DATA_SEM_NORMAL = 0,
+ I_DATA_SEM_OTHER,
+ I_DATA_SEM_QUOTA,
+};
+
+
+/*
* fourth extended file system inode data in memory
*/
struct ext4_inode_info {
@@ -910,6 +933,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
+ /*
+ * i_mmap_sem is for serializing page faults with truncate / punch hole
+ * operations. We have to make sure that new page cannot be faulted in
+ * a section of the inode that is being punched. We cannot easily use
+ * i_data_sem for this since we need protection for the whole punch
+ * operation and i_data_sem ranks below transaction start so we have
+ * to occasionally drop it.
+ */
+ struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -2484,6 +2516,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
@@ -2848,6 +2881,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
return changed;
}
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len);
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 551353b1b17a..3578b25fccfd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
if (len <= EXT_UNWRITTEN_MAX_LEN)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
/*
* credits to insert 1 extent into extent tree
*/
@@ -4752,8 +4748,6 @@ retry:
goto retry;
}
- ext4_inode_resume_unlocked_dio(inode);
-
return ret > 0 ? ret2 : ret;
}
@@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
- struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + len - 1);
- if (ret)
- return ret;
- }
-
- /*
* Round up offset. This is not fallocate, we neet to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
@@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags, mode);
if (ret)
- goto out_mutex;
+ goto out_dio;
}
@@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
- /* Now release the pages and zero block aligned part of pages*/
+ /*
+ * Prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ goto out_dio;
+ }
+ /* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
@@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ ext4_inode_resume_unlocked_dio(inode);
if (ret)
goto out;
@@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down offset to be aligned with page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
mutex_lock(&inode->i_mutex);
-
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
@@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /*
+ * Write tail of the last page before removed range since it will get
+ * removed from the page cache below.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ if (ret)
+ goto out_mmap;
+ /*
+ * Write data that will be shifted to preserve them when discarding
+ * page cache below. We are also protected from pages becoming dirty
+ * by i_mmap_sem.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,7 +5583,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
@@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
mutex_lock(&inode->i_mutex);
-
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ret = -EOPNOTSUPP;
@@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down to align start offset to page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
/* Expand file to avoid data loss if there is error while shifting */
@@ -5741,7 +5753,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 113837e7ba98..0d24ebcd7c9e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int result;
handle_t *handle = NULL;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
- }
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
@@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
- }
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
@@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
ext4_chunk_trans_blocks(inode,
PMD_SIZE / PAGE_SIZE));
- }
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
@@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
- }
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block_dax,
- ext4_end_io_unwritten);
+ int err;
+ struct inode *inode = file_inode(vma->vm_file);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(inode->i_sb);
+
+ return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+
+ return ret;
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1b8024d26f65..5388207d2832 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init inode bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
return bh;
}
ext4_unlock_group(sb, block_group);
@@ -1141,25 +1143,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
ext4_group_t block_group;
int bit;
- struct buffer_head *bitmap_bh;
+ struct buffer_head *bitmap_bh = NULL;
struct inode *inode = NULL;
- long err = -EIO;
+ int err = -EFSCORRUPTED;
- /* Error cases - e2fsck has already cleaned up for us */
- if (ino > max_ino) {
- ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
- err = -EFSCORRUPTED;
- goto error;
- }
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ goto bad_orphan;
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
- ino, err);
- goto error;
+ ext4_error(sb, "inode bitmap error %ld for orphan %lu",
+ ino, PTR_ERR(bitmap_bh));
+ return (struct inode *) bitmap_bh;
}
/* Having the inode bit set should be a 100% indicator that this
@@ -1170,15 +1167,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
goto bad_orphan;
inode = ext4_iget(sb, ino);
- if (IS_ERR(inode))
- goto iget_failed;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
+ ino, err);
+ return inode;
+ }
/*
- * If the orphans has i_nlinks > 0 then it should be able to be
- * truncated, otherwise it won't be removed from the orphan list
- * during processing and an infinite loop will result.
+ * If the orphans has i_nlinks > 0 then it should be able to
+ * be truncated, otherwise it won't be removed from the orphan
+ * list during processing and an infinite loop will result.
+ * Similarly, it must not be a bad inode.
*/
- if (inode->i_nlink && !ext4_can_truncate(inode))
+ if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
+ is_bad_inode(inode))
goto bad_orphan;
if (NEXT_ORPHAN(inode) > max_ino)
@@ -1186,29 +1189,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
brelse(bitmap_bh);
return inode;
-iget_failed:
- err = PTR_ERR(inode);
- inode = NULL;
bad_orphan:
- ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
- bit, (unsigned long long)bitmap_bh->b_blocknr,
- ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_WARNING "inode=%p\n", inode);
+ ext4_error(sb, "bad orphan inode %lu", ino);
+ if (bitmap_bh)
+ printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ bit, (unsigned long long)bitmap_bh->b_blocknr,
+ ext4_test_bit(bit, bitmap_bh->b_data));
if (inode) {
- printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
+ printk(KERN_ERR "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_WARNING "max_ino=%lu\n", max_ino);
- printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
+ printk(KERN_ERR "max_ino=%lu\n", max_ino);
+ printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
iput(inode);
}
brelse(bitmap_bh);
-error:
return ERR_PTR(err);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ea433a7f4bca..e31d762eedce 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,34 @@ has_zeroout:
return retval;
}
+/*
+ * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
+ * we have to be careful as someone else may be manipulating b_state as well.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
+{
+ unsigned long old_state;
+ unsigned long new_state;
+
+ flags &= EXT4_MAP_FLAGS;
+
+ /* Dummy buffer_head? Set non-atomically. */
+ if (!bh->b_page) {
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
+ return;
+ }
+ /*
+ * Someone else may be modifying b_state. Be careful! This is ugly but
+ * once we get rid of using bh as a container for mapping information
+ * to pass to / from get_block functions, this can go away.
+ */
+ do {
+ old_state = READ_ONCE(bh->b_state);
+ new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+ } while (unlikely(
+ cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+}
+
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -693,7 +721,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_io_end_t *io_end = ext4_inode_aio(inode);
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (IS_DAX(inode) && buffer_unwritten(bh)) {
/*
* dgc: I suspect unwritten conversion on ext4+DAX is
@@ -1669,7 +1697,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (buffer_unwritten(bh)) {
/* A delayed write to unwritten bh should be marked
@@ -3559,6 +3587,35 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise i_disksize update
+ * can get lost as it may have been postponed to submission of writeback but
+ * that will never happen after we truncate page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ loff_t size = i_size_read(inode);
+
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ if (offset > size || offset + len < size)
+ return 0;
+
+ if (EXT4_I(inode)->i_disksize >= size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_update_i_disksize(inode, size);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return 0;
+}
+
+/*
* ext4_punch_hole: punches a hole in a file by releaseing the blocks
* associated with the given offset and length
*
@@ -3623,17 +3680,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset)
+ if (last_block_offset > first_block_offset) {
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
+ if (ret)
+ goto out_dio;
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
+ }
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -3680,16 +3746,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- /* Now release the pages again to reduce race window */
- if (last_block_offset > first_block_offset)
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
-
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
@@ -4823,6 +4885,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
+ down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
@@ -4830,6 +4893,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
truncate_pagecache(inode, inode->i_size);
if (shrink)
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
if (!rc) {
@@ -5081,6 +5145,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
might_sleep();
trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ return err;
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5111,9 +5177,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
}
}
}
- if (!err)
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- return err;
+ return ext4_mark_iloc_dirty(handle, inode, &iloc);
}
/*
@@ -5278,6 +5342,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -5347,6 +5413,19 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int err;
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_fault(vma, vmf);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return err;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 61eaf74dca37..cf734170daa9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1259,6 +1259,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
int order = 1;
+ int bb_incr = 1 << (e4b->bd_blkbits - 1);
void *bb;
BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
@@ -1271,7 +1272,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
/* this block is part of buddy of order 'order' */
return order;
}
- bb += 1 << (e4b->bd_blkbits - order);
+ bb += bb_incr;
+ bb_incr >>= 1;
order++;
}
return 0;
@@ -2576,7 +2578,7 @@ int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
- unsigned offset;
+ unsigned offset, offset_incr;
unsigned max;
int ret;
@@ -2605,11 +2607,13 @@ int ext4_mb_init(struct super_block *sb)
i = 1;
offset = 0;
+ offset_incr = 1 << (sb->s_blocksize_bits - 1);
max = sb->s_blocksize << 2;
do {
sbi->s_mb_offsets[i] = offset;
sbi->s_mb_maxs[i] = max;
- offset += 1 << (sb->s_blocksize_bits - i);
+ offset += offset_incr;
+ offset_incr = offset_incr >> 1;
max = max >> 1;
i++;
} while (i <= sb->s_blocksize_bits + 1);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index fb6f11709ae6..796ff0eafd3c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
} else {
down_write(&EXT4_I(second)->i_data_sem);
- down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
}
}
@@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int tmp_data_size, data_size, replaced_size;
- int err2, jblocks, retries = 0;
+ int i, err2, jblocks, retries = 0;
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
+ struct buffer_head *bh = NULL;
/*
* It needs twice the amount of ordinary journal buffers because
@@ -380,8 +381,17 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- *err = __block_write_begin(pagep[0], from, replaced_size,
- ext4_get_block);
+ if (!page_has_buffers(pagep[0]))
+ create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
+ bh = page_buffers(pagep[0]);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+ for (i = 0; i < block_len_in_page; i++) {
+ *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
+ if (*err < 0)
+ break;
+ bh = bh->b_this_page;
+ }
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -474,6 +484,13 @@ mext_check_arguments(struct inode *orig_inode,
return -EBUSY;
}
+ if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be quota files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EBUSY;
+ }
+
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a969ab39f302..91bf36f22dbf 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2809,7 +2809,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
* list entries can cause panics at unmount time.
*/
mutex_lock(&sbi->s_orphan_lock);
- list_del(&EXT4_I(inode)->i_orphan);
+ list_del_init(&EXT4_I(inode)->i_orphan);
mutex_unlock(&sbi->s_orphan_lock);
}
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ad62d7acc315..34038e3598d5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
goto out2;
flex_gd->count = flexbg_size;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9ab67da6e5a..852c26806af2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -958,6 +958,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
+ init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -1292,9 +1293,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
return -1;
}
if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
- "when QUOTA feature is enabled");
- return -1;
+ ext4_msg(sb, KERN_INFO, "Journaled quota options "
+ "ignored when QUOTA feature is enabled");
+ return 1;
}
qname = match_strdup(args);
if (!qname) {
@@ -1657,10 +1658,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Cannot set journaled quota options "
+ ext4_msg(sb, KERN_INFO,
+ "Quota format mount options ignored "
"when QUOTA feature is enabled");
- return -1;
+ return 1;
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
@@ -1721,11 +1722,11 @@ static int parse_options(char *options, struct super_block *sb,
#ifdef CONFIG_QUOTA
if (ext4_has_feature_quota(sb) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
- ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
- "feature is enabled");
- return 0;
- }
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota "
+ "mount options ignored.");
+ clear_opt(sb, USRQUOTA);
+ clear_opt(sb, GRPQUOTA);
+ } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -4936,6 +4937,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
EXT4_SB(sb)->s_jquota_fmt, type);
}
+static void lockdep_set_quota_inode(struct inode *inode, int subclass)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ /* The first argument of lockdep_set_subclass has to be
+ * *exactly* the same as the argument to init_rwsem() --- in
+ * this case, in init_once() --- or lockdep gets unhappy
+ * because the name of the lock is set using the
+ * stringification of the argument to init_rwsem().
+ */
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, subclass);
+}
+
/*
* Standard function to be called on quota_on
*/
@@ -4975,8 +4990,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (err)
return err;
}
-
- return dquot_quota_on(sb, type, format_id, path);
+ lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
+ return err;
}
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
@@ -5002,8 +5021,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
+ if (err)
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
return err;
}
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670d99..c70d06a383e2 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712dfa3e7..ca3c3dd01789 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd,
path_put(&path);
return fd;
}
- file = file_open_root(path.dentry, path.mnt, "", open_flag);
+ file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
if (IS_ERR(file)) {
put_unused_fd(fd);
retval = PTR_ERR(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 023f6a1f23cd..60d6fc2e0e4b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -278,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
wb_get(wb);
spin_unlock(&inode->i_lock);
spin_lock(&wb->list_lock);
- wb_put(wb); /* not gonna deref it anymore */
/* i_wb may have changed inbetween, can't use inode_to_wb() */
- if (likely(wb == inode->i_wb))
- return wb; /* @inode already has ref */
+ if (likely(wb == inode->i_wb)) {
+ wb_put(wb); /* @inode already has ref */
+ return wb;
+ }
spin_unlock(&wb->list_lock);
+ wb_put(wb);
cpu_relax();
spin_lock(&inode->i_lock);
}
@@ -424,6 +429,8 @@ skip_switch:
iput(inode);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -433,7 +440,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -469,7 +476,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
inode_to_wb(inode) == isw->new_wb) {
spin_unlock(&inode->i_lock);
goto out_free;
@@ -480,6 +488,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -842,6 +852,33 @@ restart:
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
@@ -1304,10 +1341,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
* and does more profound writeback list handling in writeback_sb_inodes().
*/
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+ struct writeback_control *wbc)
{
+ struct bdi_writeback *wb;
int ret = 0;
spin_lock(&inode->i_lock);
@@ -1345,7 +1382,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
ret = __writeback_single_inode(inode, wbc);
wbc_detach_inode(wbc);
- spin_lock(&wb->list_lock);
+
+ wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
/*
* If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1420,6 +1458,7 @@ static long writeback_sb_inodes(struct super_block *sb,
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
+ struct bdi_writeback *tmp_wb;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1510,15 +1549,23 @@ static long writeback_sb_inodes(struct super_block *sb,
cond_resched();
}
-
- spin_lock(&wb->list_lock);
+ /*
+ * Requeue @inode if still dirty. Be careful as @inode may
+ * have been switched to another wb in the meantime.
+ */
+ tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
- requeue_inode(inode, wb, &wbc);
+ requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
+ if (unlikely(tmp_wb != wb)) {
+ spin_unlock(&tmp_wb->list_lock);
+ spin_lock(&wb->list_lock);
+ }
+
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
@@ -2305,7 +2352,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
*/
int write_inode_now(struct inode *inode, int sync)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2317,7 +2363,7 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;
might_sleep();
- return writeback_single_inode(inode, wb, &wbc);
+ return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);
@@ -2334,7 +2380,7 @@ EXPORT_SYMBOL(write_inode_now);
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
- return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+ return writeback_single_inode(inode, wbc);
}
EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8e3ee1936c7e..c5b6b7165489 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
{
- struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
loff_t pos = 0;
return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
@@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
{
- struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
loff_t pos = 0;
/*
* No locking or generic_write_checks(), the server is
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 570ca4053c80..c2e340d6ec6e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -528,6 +528,11 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
+static void fuse_io_release(struct kref *kref)
+{
+ kfree(container_of(kref, struct fuse_io_priv, refcnt));
+}
+
static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
if (io->err)
@@ -585,8 +590,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
}
io->iocb->ki_complete(io->iocb, res, 0);
- kfree(io);
}
+
+ kref_put(&io->refcnt, fuse_io_release);
}
static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
@@ -613,6 +619,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
size_t num_bytes, struct fuse_io_priv *io)
{
spin_lock(&io->lock);
+ kref_get(&io->refcnt);
io->size += num_bytes;
io->reqs++;
spin_unlock(&io->lock);
@@ -691,7 +698,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
static int fuse_do_readpage(struct file *file, struct page *page)
{
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
@@ -984,7 +991,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
size_t res;
unsigned offset;
unsigned i;
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
for (i = 0; i < req->num_pages; i++)
fuse_wait_on_page_writeback(inode, req->pages[i]->index);
@@ -1398,7 +1405,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
return __fuse_direct_read(&io, to, &iocb->ki_pos);
}
@@ -1406,7 +1413,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
ssize_t res;
if (is_bad_inode(inode))
@@ -2786,6 +2793,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
loff_t i_size;
size_t count = iov_iter_count(iter);
struct fuse_io_priv *io;
+ bool is_sync = is_sync_kiocb(iocb);
pos = offset;
inode = file->f_mapping->host;
@@ -2806,6 +2814,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
if (!io)
return -ENOMEM;
spin_lock_init(&io->lock);
+ kref_init(&io->refcnt);
io->reqs = 1;
io->bytes = -1;
io->size = 0;
@@ -2825,12 +2834,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
* to wait on real async I/O requests, so we must submit this request
* synchronously.
*/
- if (!is_sync_kiocb(iocb) && (offset + count > i_size) &&
+ if (!is_sync && (offset + count > i_size) &&
iov_iter_rw(iter) == WRITE)
io->async = false;
- if (io->async && is_sync_kiocb(iocb))
+ if (io->async && is_sync) {
+ /*
+ * Additional reference to keep io around after
+ * calling fuse_aio_complete()
+ */
+ kref_get(&io->refcnt);
io->done = &wait;
+ }
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
@@ -2843,14 +2858,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (!is_sync_kiocb(iocb))
+ if (!is_sync)
return -EIOCBQUEUED;
wait_for_completion(&wait);
ret = fuse_get_res_by_io(io);
}
- kfree(io);
+ kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
if (ret > 0)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 405113101db8..604cd42dafef 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
#include <linux/rbtree.h>
#include <linux/poll.h>
#include <linux/workqueue.h>
+#include <linux/kref.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -243,6 +244,7 @@ struct fuse_args {
/** The request IO state (for asynchronous processing) */
struct fuse_io_priv {
+ struct kref refcnt;
int async;
spinlock_t lock;
unsigned reqs;
@@ -256,6 +258,13 @@ struct fuse_io_priv {
struct completion *done;
};
+#define FUSE_IO_PRIV_SYNC(f) \
+{ \
+ .refcnt = { ATOMIC_INIT(1) }, \
+ .async = 0, \
+ .file = f, \
+}
+
/**
* Request flags
*
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2ac99db3750e..5a7b3229b956 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -730,15 +730,13 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
init_special_inode(inode, mode, dev);
err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
- if (!err)
+ if (err)
goto out_free;
err = read_name(inode, name);
__putname(name);
if (err)
goto out_put;
- if (err)
- goto out_put;
d_instantiate(dentry, inode);
return 0;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ae4d5a1fa4c9..bffb908acbd4 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -375,12 +375,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
dnode_secno dno;
int r;
- int rep = 0;
int err;
hpfs_lock(dir->i_sb);
hpfs_adjust_length(name, &len);
-again:
+
err = -ENOENT;
de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
if (!de)
@@ -400,33 +399,9 @@ again:
hpfs_error(dir->i_sb, "there was error when removing dirent");
err = -EFSERROR;
break;
- case 2: /* no space for deleting, try to truncate file */
-
+ case 2: /* no space for deleting */
err = -ENOSPC;
- if (rep++)
- break;
-
- dentry_unhash(dentry);
- if (!d_unhashed(dentry)) {
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
- }
- if (generic_permission(inode, MAY_WRITE) ||
- !S_ISREG(inode->i_mode) ||
- get_write_access(inode)) {
- d_rehash(dentry);
- } else {
- struct iattr newattrs;
- /*pr_info("truncating file before delete.\n");*/
- newattrs.ia_size = 0;
- newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs, NULL);
- put_write_access(inode);
- if (!err)
- goto again;
- }
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
+ break;
default:
drop_nlink(inode);
err = 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a561591896bd..3713fd52b44b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
@@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
int lowercase, eas, chk, errs, chkdsk, timeshift;
int o;
struct hpfs_sb_info *sbi = hpfs_sb(s);
- char *new_opts = kstrdup(data, GFP_KERNEL);
-
- if (!new_opts)
- return -ENOMEM;
sync_filesystem(s);
@@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
- replace_mount_options(s, new_opts);
-
hpfs_unlock(s);
return 0;
out_err:
hpfs_unlock(s);
- kfree(new_opts);
return -EINVAL;
}
+static int hpfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb);
+
+ seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid));
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid));
+ seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777));
+ if (sbi->sb_lowercase)
+ seq_printf(seq, ",case=lower");
+ if (!sbi->sb_chk)
+ seq_printf(seq, ",check=none");
+ if (sbi->sb_chk == 2)
+ seq_printf(seq, ",check=strict");
+ if (!sbi->sb_err)
+ seq_printf(seq, ",errors=continue");
+ if (sbi->sb_err == 2)
+ seq_printf(seq, ",errors=panic");
+ if (!sbi->sb_chkdsk)
+ seq_printf(seq, ",chkdsk=no");
+ if (sbi->sb_chkdsk == 2)
+ seq_printf(seq, ",chkdsk=always");
+ if (!sbi->sb_eas)
+ seq_printf(seq, ",eas=no");
+ if (sbi->sb_eas == 1)
+ seq_printf(seq, ",eas=ro");
+ if (sbi->sb_timeshift)
+ seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift);
+ return 0;
+}
+
/* Super operations */
static const struct super_operations hpfs_sops =
@@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops =
.put_super = hpfs_put_super,
.statfs = hpfs_statfs,
.remount_fs = hpfs_remount_fs,
- .show_options = generic_show_options,
+ .show_options = hpfs_show_options,
};
static int hpfs_fill_super(struct super_block *s, void *options, int silent)
@@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
int o;
- save_mount_options(s, options);
-
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi) {
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index de4bdfac0cec..595ebdb41846 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -463,6 +463,7 @@ hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
*/
vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
+ unsigned long v_end;
/*
* Can the expression below overflow on 32-bit arches?
@@ -475,15 +476,17 @@ hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
else
v_offset = 0;
- if (end) {
- end = ((end - start) << PAGE_SHIFT) +
- vma->vm_start + v_offset;
- if (end > vma->vm_end)
- end = vma->vm_end;
- } else
- end = vma->vm_end;
+ if (!end)
+ v_end = vma->vm_end;
+ else {
+ v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+ + vma->vm_start;
+ if (v_end > vma->vm_end)
+ v_end = vma->vm_end;
+ }
- unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+ NULL);
}
}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 735d7522a3a9..204659a5f6db 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -203,6 +203,8 @@ int get_rock_ridge_filename(struct iso_directory_record *de,
int retnamlen = 0;
int truncate = 0;
int ret = 0;
+ char *p;
+ int len;
if (!ISOFS_SB(inode->i_sb)->s_rock)
return 0;
@@ -267,12 +269,17 @@ repeat:
rr->u.NM.flags);
break;
}
- if ((strlen(retname) + rr->len - 5) >= 254) {
+ len = rr->len - 5;
+ if (retnamlen + len >= 254) {
truncate = 1;
break;
}
- strncat(retname, rr->u.NM.name, rr->len - 5);
- retnamlen += rr->len - 5;
+ p = memchr(rr->u.NM.name, '\0', len);
+ if (unlikely(p))
+ len = p - rr->u.NM.name;
+ memcpy(retname + retnamlen, rr->u.NM.name, len);
+ retnamlen += len;
+ retname[retnamlen] = '\0';
break;
case SIG('R', 'E'):
kfree(rs.buffer);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81e622681c82..624a57a9c4aa 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1408,11 +1408,12 @@ out:
/**
* jbd2_mark_journal_empty() - Mark on disk journal as empty.
* @journal: The journal to update.
+ * @write_op: With which operation should we write the journal sb
*
* Update a journal's dynamic superblock fields to show that journal is empty.
* Write updated superblock to disk waiting for IO to complete.
*/
-static void jbd2_mark_journal_empty(journal_t *journal)
+static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
@@ -1430,7 +1431,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
sb->s_start = cpu_to_be32(0);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_FUA);
+ jbd2_write_superblock(journal, write_op);
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
@@ -1716,7 +1717,13 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal);
+
+ write_lock(&journal->j_state_lock);
+ journal->j_tail_sequence =
+ ++journal->j_transaction_sequence;
+ write_unlock(&journal->j_state_lock);
+
+ jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1975,7 +1982,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal);
+ jbd2_mark_journal_empty(journal, WRITE_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2021,7 +2028,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal);
+ jbd2_mark_journal_empty(journal, WRITE_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index 3ea36554107f..8918ac905a3b 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -2,10 +2,6 @@
JFFS2 LOCKING DOCUMENTATION
---------------------------
-At least theoretically, JFFS2 does not require the Big Kernel Lock
-(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
-code. It has its own locking, as described below.
-
This document attempts to describe the existing locking rules for
JFFS2. It is not expected to remain perfectly up to date, but ought to
be fairly close.
@@ -69,6 +65,7 @@ Ordering constraints:
any f->sem held.
2. Never attempt to lock two file mutexes in one thread.
No ordering rules have been made for doing so.
+ 3. Never lock a page cache page with f->sem held.
erase_completion_lock spinlock
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a3750f902adc..c1f04947d7dc 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -49,7 +49,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
- struct jffs2_inode_cache *ic)
+ struct jffs2_inode_cache *ic,
+ int *dir_hardlinks)
{
struct jffs2_full_dirent *fd;
@@ -68,19 +69,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
fd->name, fd->ino, ic->ino);
jffs2_mark_node_obsolete(c, fd->raw);
+ /* Clear the ic/raw union so it doesn't cause problems later. */
+ fd->ic = NULL;
continue;
}
+ /* From this point, fd->raw is no longer used so we can set fd->ic */
+ fd->ic = child_ic;
+ child_ic->pino_nlink++;
+ /* If we appear (at this stage) to have hard-linked directories,
+ * set a flag to trigger a scan later */
if (fd->type == DT_DIR) {
- if (child_ic->pino_nlink) {
- JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
- fd->name, fd->ino, ic->ino);
- /* TODO: What do we do about it? */
- } else {
- child_ic->pino_nlink = ic->ino;
- }
- } else
- child_ic->pino_nlink++;
+ child_ic->flags |= INO_FLAGS_IS_DIR;
+ if (child_ic->pino_nlink > 1)
+ *dir_hardlinks = 1;
+ }
dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
/* Can't free scan_dents so far. We might need them in pass 2 */
@@ -94,8 +97,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
*/
static int jffs2_build_filesystem(struct jffs2_sb_info *c)
{
- int ret;
- int i;
+ int ret, i, dir_hardlinks = 0;
struct jffs2_inode_cache *ic;
struct jffs2_full_dirent *fd;
struct jffs2_full_dirent *dead_fds = NULL;
@@ -119,7 +121,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
/* Now scan the directory tree, increasing nlink according to every dirent found. */
for_each_inode(i, c, ic) {
if (ic->scan_dents) {
- jffs2_build_inode_pass1(c, ic);
+ jffs2_build_inode_pass1(c, ic, &dir_hardlinks);
cond_resched();
}
}
@@ -155,6 +157,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
}
dbg_fsbuild("pass 2a complete\n");
+
+ if (dir_hardlinks) {
+ /* If we detected directory hardlinks earlier, *hopefully*
+ * they are gone now because some of the links were from
+ * dead directories which still had some old dirents lying
+ * around and not yet garbage-collected, but which have
+ * been discarded above. So clear the pino_nlink field
+ * in each directory, so that the final scan below can
+ * print appropriate warnings. */
+ for_each_inode(i, c, ic) {
+ if (ic->flags & INO_FLAGS_IS_DIR)
+ ic->pino_nlink = 0;
+ }
+ }
dbg_fsbuild("freeing temporary data structures\n");
/* Finally, we can scan again and free the dirent structs */
@@ -162,6 +178,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
while(ic->scan_dents) {
fd = ic->scan_dents;
ic->scan_dents = fd->next;
+ /* We do use the pino_nlink field to count nlink of
+ * directories during fs build, so set it to the
+ * parent ino# now. Now that there's hopefully only
+ * one. */
+ if (fd->type == DT_DIR) {
+ if (!fd->ic) {
+ /* We'll have complained about it and marked the coresponding
+ raw node obsolete already. Just skip it. */
+ continue;
+ }
+
+ /* We *have* to have set this in jffs2_build_inode_pass1() */
+ BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR));
+
+ /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks
+ * is set. Otherwise, we know this should never trigger anyway, so
+ * we don't do the check. And ic->pino_nlink still contains the nlink
+ * value (which is 1). */
+ if (dir_hardlinks && fd->ic->pino_nlink) {
+ JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n",
+ fd->name, fd->ino, ic->ino, fd->ic->pino_nlink);
+ /* Should we unlink it from its previous parent? */
+ }
+
+ /* For directories, ic->pino_nlink holds that parent inode # */
+ fd->ic->pino_nlink = ic->ino;
+ }
jffs2_free_full_dirent(fd);
}
ic->scan_dents = NULL;
@@ -240,11 +283,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
/* Reduce nlink of the child. If it's now zero, stick it on the
dead_fds list to be cleaned up later. Else just free the fd */
-
- if (fd->type == DT_DIR)
- child_ic->pino_nlink = 0;
- else
- child_ic->pino_nlink--;
+ child_ic->pino_nlink--;
if (!child_ic->pino_nlink) {
dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e18566..30c4c9ebb693 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f509f62e12f6..3361979d728c 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- struct jffs2_raw_inode ri;
- uint32_t alloc_len = 0;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
- jffs2_dbg(1, "%s()\n", __func__);
-
- if (pageofs > inode->i_size) {
- ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
- ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
- if (ret)
- return ret;
- }
-
- mutex_lock(&f->sem);
pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg) {
- if (alloc_len)
- jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
+ if (!pg)
return -ENOMEM;
- }
*pagep = pg;
- if (alloc_len) {
+ jffs2_dbg(1, "%s()\n", __func__);
+
+ if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+ struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
+ uint32_t alloc_len;
jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
(unsigned int)inode->i_size, pageofs);
+ ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+ ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+ if (ret)
+ goto out_page;
+
+ mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (IS_ERR(fn)) {
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
+ mutex_unlock(&f->sem);
}
/*
@@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* case of a short-copy.
*/
if (!PageUptodate(pg)) {
+ mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
+ mutex_unlock(&f->sem);
if (ret)
goto out_page;
}
- mutex_unlock(&f->sem);
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
unlock_page(pg);
page_cache_release(pg);
- mutex_unlock(&f->sem);
return ret;
}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 5a2dec2b064c..95d5880a63ee 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
BUG_ON(start > orig_start);
}
- /* First, use readpage() to read the appropriate page into the page cache */
- /* Q: What happens if we actually try to GC the _same_ page for which commit_write()
- * triggered garbage collection in the first place?
- * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
- * page OK. We'll actually write it out again in commit_write, which is a little
- * suboptimal, but at least we're correct.
- */
+ /* The rules state that we must obtain the page lock *before* f->sem, so
+ * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
+ * actually going to *change* so we're safe; we only allow reading.
+ *
+ * It is important to note that jffs2_write_begin() will ensure that its
+ * page is marked Uptodate before allocating space. That means that if we
+ * end up here trying to GC the *same* page that jffs2_write_begin() is
+ * trying to write out, read_cache_page() will not deadlock. */
+ mutex_unlock(&f->sem);
pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+ mutex_lock(&f->sem);
if (IS_ERR(pg_ptr)) {
pr_warn("read_cache_page() returned error: %ld\n",
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index fa35ff79ab35..0637271f3770 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -194,6 +194,7 @@ struct jffs2_inode_cache {
#define INO_STATE_CLEARING 6 /* In clear_inode() */
#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */
+#define INO_FLAGS_IS_DIR 0x02 /* is a directory */
#define RAWNODE_CLASS_INODE_CACHE 0
#define RAWNODE_CLASS_XATTR_DATUM 1
@@ -249,7 +250,10 @@ struct jffs2_readinode_info
struct jffs2_full_dirent
{
- struct jffs2_raw_node_ref *raw;
+ union {
+ struct jffs2_raw_node_ref *raw;
+ struct jffs2_inode_cache *ic; /* Just during part of build */
+ };
struct jffs2_full_dirent *next;
uint32_t version;
uint32_t ino; /* == zero for unlink */
diff --git a/fs/locks.c b/fs/locks.c
index 0d2b3267e2a3..6333263b7bc8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2182,7 +2182,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2224,19 +2223,22 @@ again:
* Attempt to detect a close/fcntl race and recover by
* releasing the lock that was just acquired.
*/
- /*
- * we need that spin_lock here - it prevents reordering between
- * update of i_flctx->flc_posix and check for it done in close().
- * rcu_read_lock() wouldn't do.
- */
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
locks_free_lock(file_lock);
return error;
@@ -2322,7 +2324,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock64_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2364,14 +2365,22 @@ again:
* Attempt to detect a close/fcntl race and recover by
* releasing the lock that was just acquired.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
locks_free_lock(file_lock);
return error;
diff --git a/fs/namei.c b/fs/namei.c
index 0c3974cd3ecd..209ca7737cb2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1711,6 +1711,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
return 0;
if (!follow)
return 0;
+ /* make sure that d_is_symlink above matches inode */
+ if (nd->flags & LOOKUP_RCU) {
+ if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ return -ECHILD;
+ }
return pick_link(nd, link, inode, seq);
}
@@ -1742,11 +1747,11 @@ static int walk_component(struct nameidata *nd, int flags)
if (err < 0)
return err;
- inode = d_backing_inode(path.dentry);
seq = 0; /* we are already out of RCU mode */
err = -ENOENT;
if (d_is_negative(path.dentry))
goto out_path_put;
+ inode = d_backing_inode(path.dentry);
}
if (flags & WALK_PUT)
@@ -2901,22 +2906,10 @@ no_open:
dentry = lookup_real(dir, dentry, nd->flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
-
- if (create_error) {
- int open_flag = op->open_flag;
-
- error = create_error;
- if ((open_flag & O_EXCL)) {
- if (!dentry->d_inode)
- goto out;
- } else if (!dentry->d_inode) {
- goto out;
- } else if ((open_flag & O_TRUNC) &&
- d_is_reg(dentry)) {
- goto out;
- }
- /* will fail later, go on to get the right error */
- }
+ }
+ if (create_error && !dentry->d_inode) {
+ error = create_error;
+ goto out;
}
looked_up:
path->dentry = dentry;
@@ -3130,12 +3123,12 @@ retry_lookup:
return error;
BUG_ON(nd->flags & LOOKUP_RCU);
- inode = d_backing_inode(path.dentry);
seq = 0; /* out of RCU mode, so the value doesn't matter */
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
+ inode = d_backing_inode(path.dentry);
finish_lookup:
if (nd->depth)
put_link(nd);
@@ -3144,11 +3137,6 @@ finish_lookup:
if (unlikely(error))
return error;
- if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
- path_to_nameidata(&path, nd);
- return -ELOOP;
- }
-
if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
path_to_nameidata(&path, nd);
} else {
@@ -3167,6 +3155,10 @@ finish_open:
return error;
}
audit_inode(nd->name, nd->path.dentry, 0);
+ if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+ error = -ELOOP;
+ goto out;
+ }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3210,6 +3202,10 @@ opened:
goto exit_fput;
}
out:
+ if (unlikely(error > 0)) {
+ WARN_ON(1);
+ error = -EINVAL;
+ }
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
@@ -4187,7 +4183,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
- if (source == target)
+ /*
+ * Check source == target.
+ * On overlayfs need to look at underlying inodes.
+ */
+ if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f0e3e9e747dd..03446c5a3ec1 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce5a21861074..5fc2162afb67 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -377,7 +377,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
again:
timestamp = jiffies;
gencount = nfs_inc_attr_generation_counter();
- error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
+ error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
/* We requested READDIRPLUS, but the server doesn't grok it */
@@ -560,7 +560,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
count++;
if (desc->plus != 0)
- nfs_prime_dcache(desc->file->f_path.dentry, entry);
+ nfs_prime_dcache(file_dentry(desc->file), entry);
status = nfs_readdir_add_to_array(entry, page);
if (status != 0)
@@ -864,7 +864,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
*/
static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 03516c80855a..2a2e2d8ddee5 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return false;
for (i = 0; i < m1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; i++) {
+ for (j = 0; j < m2->fh_versions_cnt; j++) {
if (nfs_compare_fh(&m1->fh_versions[i],
&m2->fh_versions[j]) == 0) {
found_fh = true;
@@ -1859,11 +1859,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
start = xdr_reserve_space(xdr, 4);
BUG_ON(!start);
- if (ff_layout_encode_ioerr(flo, xdr, args))
- goto out;
-
+ ff_layout_encode_ioerr(flo, xdr, args);
ff_layout_encode_iostats(flo, xdr, args);
-out:
+
*start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c7e8b87da5b2..f714b98cfd74 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -927,7 +927,7 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
@@ -1641,6 +1641,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1702,22 +1703,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
inode->i_version = fattr->change_attr;
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity;
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- } else if (server->caps & NFS_CAP_MTIME)
+ } else if (server->caps & NFS_CAP_MTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- } else if (server->caps & NFS_CAP_CTIME)
+ } else if (server->caps & NFS_CAP_CTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1737,19 +1744,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)cur_isize,
(long long)new_isize);
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- else if (server->caps & NFS_CAP_ATIME)
+ else if (server->caps & NFS_CAP_ATIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -1758,36 +1769,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_mode = newmode;
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
- } else if (server->caps & NFS_CAP_MODE)
+ } else if (server->caps & NFS_CAP_MODE) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- } else if (server->caps & NFS_CAP_OWNER)
+ } else if (server->caps & NFS_CAP_OWNER) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ } else if (server->caps & NFS_CAP_OWNER_GROUP) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1796,19 +1813,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
}
- } else if (server->caps & NFS_CAP_NLINK)
+ } else if (server->caps & NFS_CAP_NLINK) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- }
- if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
+ else
+ cache_revalidated = false;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1818,9 +1838,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
- if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
- if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
- nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ if (cache_revalidated) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+ nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ nfsi->attrtimeo <<= 1;
+ if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ }
nfsi->attrtimeo_timestamp = now;
}
/* Set the barrier to be more recent than this fattr */
@@ -1829,7 +1853,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
/* Don't declare attrcache up to date if there were no attrs! */
- if (fattr->valid != 0)
+ if (cache_revalidated)
invalid &= ~NFS_INO_INVALID_ATTR;
/* Don't invalidate the data if we were to blame */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index db9b5fea5b3e..679e003818b1 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -26,7 +26,7 @@ static int
nfs4_file_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file_dentry(filp);
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
@@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 89818036f035..98a44157353a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1385,6 +1385,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
+ spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
if (deleg_stateid != NULL) {
nfs4_stateid_copy(&state->stateid, deleg_stateid);
@@ -1393,7 +1394,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
if (open_stateid != NULL)
nfs_set_open_stateid_locked(state, open_stateid, fmode);
write_sequnlock(&state->seqlock);
- spin_lock(&state->owner->so_lock);
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
@@ -2461,9 +2461,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dentry = d_add_unique(dentry, igrab(state->inode));
if (dentry == NULL) {
dentry = opendata->dentry;
- } else if (dentry != ctx->dentry) {
+ } else {
dput(ctx->dentry);
- ctx->dentry = dget(dentry);
+ ctx->dentry = dentry;
}
nfs_set_verifier(dentry,
nfs_save_change_attribute(d_inode(opendata->dir)));
@@ -8054,7 +8054,6 @@ static void nfs4_layoutreturn_release(void *calldata)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
pnfs_clear_layoutreturn_waitbit(lo);
- lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a9f096c7e99f..7d5351cd67fb 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -877,6 +877,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&exp, &dentry);
if (err)
return err;
+ fh_unlock(&cstate->current_fh);
if (d_really_is_negative(dentry)) {
exp_put(exp);
err = nfserr_noent;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..12935209deca 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
READ_BUF(4);
rename->rn_snamelen = be32_to_cpup(p++);
- READ_BUF(rename->rn_snamelen + 4);
+ READ_BUF(rename->rn_snamelen);
SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+ READ_BUF(4);
rename->rn_tnamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
@@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
READ_BUF(8);
setclientid->se_callback_prog = be32_to_cpup(p++);
setclientid->se_callback_netid_len = be32_to_cpup(p++);
-
- READ_BUF(setclientid->se_callback_netid_len + 4);
+ READ_BUF(setclientid->se_callback_netid_len);
SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+ READ_BUF(4);
setclientid->se_callback_addr_len = be32_to_cpup(p++);
- READ_BUF(setclientid->se_callback_addr_len + 4);
+ READ_BUF(setclientid->se_callback_addr_len);
SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+ READ_BUF(4);
setclientid->se_callback_ident = be32_to_cpup(p++);
DECODE_TAIL;
@@ -1815,8 +1817,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
READ_BUF(4);
argp->taglen = be32_to_cpup(p++);
- READ_BUF(argp->taglen + 8);
+ READ_BUF(argp->taglen);
SAVEMEM(argp->tag, argp->taglen);
+ READ_BUF(8);
argp->minorversion = be32_to_cpup(p++);
argp->opcnt = be32_to_cpup(p++);
max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0cdf497c91ef..2162434728c0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
brelse(di_bh);
return acl;
}
+
+int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (ret)
+ return ret;
+ ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+ acl, NULL, NULL);
+ posix_acl_release(acl);
+ return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl = NULL;
+ int ret = 0, ret2;
+ umode_t mode;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+ dir_bh);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl) {
+ mode = inode->i_mode & ~current_umask();
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = ocfs2_set_acl(handle, inode, di_bh,
+ ACL_TYPE_DEFAULT, acl,
+ meta_ac, data_ac);
+ if (ret)
+ goto cleanup;
+ }
+ mode = inode->i_mode;
+ ret = __posix_acl_create(&acl, GFP_NOFS, &mode);
+ if (ret < 0)
+ return ret;
+
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ acl, meta_ac, data_ac);
+ }
+ }
+cleanup:
+ posix_acl_release(acl);
+ return ret;
+}
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 3fce68d08625..2783a75b3999 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle,
struct posix_acl *acl,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac);
+extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+ struct buffer_head *, struct buffer_head *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7f604727f487..e6795c7c76a8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -956,6 +956,7 @@ clean_orphan:
tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
update_isize, end);
if (tmp_ret < 0) {
+ ocfs2_inode_unlock(inode, 1);
ret = tmp_ret;
mlog_errno(ret);
brelse(di_bh);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..f90931335c6b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -262,6 +262,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +288,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -316,11 +330,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84f2f8079466..4e2162b355db 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ /* get an extra reference on the mle.
+ * otherwise the assert_master from the new
+ * master will destroy this.
+ */
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2554,6 +2559,7 @@ fail:
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
@@ -2571,17 +2577,6 @@ fail:
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
- /* get an extra reference on the mle.
- * otherwise the assert_master from the new
- * master will destroy this.
- * also, make sure that all callers of dlm_get_mle
- * take both dlm->spinlock and dlm->master_lock */
- spin_lock(&dlm->spinlock);
- spin_lock(&dlm->master_lock);
- dlm_get_mle_inuse(mle);
- spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
-
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
@@ -3312,6 +3307,15 @@ top:
mle->new_master != dead_node)
continue;
+ if (mle->new_master == dead_node && mle->inuse) {
+ mlog(ML_NOTICE, "%s: target %u died during "
+ "migration from %u, the MLE is "
+ "still keep used, ignore it!\n",
+ dlm->name, dead_node,
+ mle->master);
+ continue;
+ }
+
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..4a338803e7e9 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2064,7 +2064,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
@@ -2360,6 +2359,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ dead_node);
spin_unlock(&res->spinlock);
continue;
}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 20276e340339..b002acf50203 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned int gen;
int noqueue_attempted = 0;
int dlm_locked = 0;
+ int kick_dc = 0;
if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
mlog_errno(-EINVAL);
@@ -1524,7 +1525,12 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
+ if (kick_dc)
+ ocfs2_wake_downconvert_thread(osb);
out:
/*
* This is helping work around a lock inversion between the page lock
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..77d30cbd944d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1268,20 +1268,20 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- brelse(bh);
/* Release quota pointers in case we acquired them */
for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
- status = posix_acl_chmod(inode, inode->i_mode);
+ status = ocfs2_acl_chmod(inode, bh);
if (status < 0)
mlog_errno(status);
}
if (inode_locked)
ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
return status;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3123408da935..62af9554541d 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
- struct posix_acl *default_acl = NULL, *acl = NULL;
struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
@@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (status) {
- mlog_errno(status);
- goto leave;
- }
-
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
- if (default_acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_DEFAULT, default_acl,
- meta_ac, data_ac);
- }
- if (!status && acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_ACCESS, acl,
- meta_ac, data_ac);
- }
+ status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+ meta_ac, data_ac);
if (status < 0) {
mlog_errno(status);
@@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 252119860e6c..6a0c55d7dff0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
- struct posix_acl *default_acl, *acl;
- umode_t mode;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
- mode = inode->i_mode;
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error) {
- mlog_errno(error);
- return error;
- }
- error = ocfs2_create_inode_in_orphan(dir, mode,
+ error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
- &new_dentry->d_name,
- default_acl, acl);
+ &new_dentry->d_name);
if (error)
mlog_errno(error);
}
out:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e9164f09841b..877830b05e12 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7197,12 +7197,10 @@ out:
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl)
+ const struct qstr *qstr)
{
- struct buffer_head *dir_bh = NULL;
int ret = 0;
+ struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7215,11 +7213,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
mlog_errno(ret);
goto leave;
}
-
- if (!ret && default_acl)
- ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
- if (!ret && acl)
- ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
+ if (ret)
+ mlog_errno(ret);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index f10d5b93c366..1633cc15ea1f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl);
+ const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/open.c b/fs/open.c
index b6f1e96a7c0b..157b9940dd73 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
- struct dentry *dentry = path->dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = vfs_select_inode(path->dentry, file->f_flags);
- file->f_path = *path;
- if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
- inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- }
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ file->f_path = *path;
return do_dentry_open(file, inode, NULL, cred);
}
@@ -995,14 +991,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
EXPORT_SYMBOL(filp_open);
struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
- const char *filename, int flags)
+ const char *filename, int flags, umode_t mode)
{
struct open_flags op;
- int err = build_open_flags(flags, 0, &op);
+ int err = build_open_flags(flags, mode, &op);
if (err)
return ERR_PTR(err);
- if (flags & O_CREAT)
- return ERR_PTR(-EINVAL);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 0a8983492d91..eff6319d5037 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -22,9 +22,9 @@
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
- ssize_t list_size, size;
- char *buf, *name, *value;
- int error;
+ ssize_t list_size, size, value_size = 0;
+ char *buf, *name, *value = NULL;
+ int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
@@ -41,29 +41,40 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
if (!buf)
return -ENOMEM;
- error = -ENOMEM;
- value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
- if (!value)
- goto out;
-
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
- goto out_free_value;
+ goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
- size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
- if (size <= 0) {
+retry:
+ size = vfs_getxattr(old, name, value, value_size);
+ if (size == -ERANGE)
+ size = vfs_getxattr(old, name, NULL, 0);
+
+ if (size < 0) {
error = size;
- goto out_free_value;
+ break;
+ }
+
+ if (size > value_size) {
+ void *new;
+
+ new = krealloc(value, size, GFP_KERNEL);
+ if (!new) {
+ error = -ENOMEM;
+ break;
+ }
+ value = new;
+ value_size = size;
+ goto retry;
}
+
error = vfs_setxattr(new, name, value, size, 0);
if (error)
- goto out_free_value;
+ break;
}
-
-out_free_value:
kfree(value);
out:
kfree(buf);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 692ceda3bc21..a2b1d7ce3e1a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
+ if (!err)
+ d_drop(dentry);
mutex_unlock(&dir->i_mutex);
return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * Old dentry now lives in different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This could be only pure upper
+ * or opaque directory - numlower is zero. Or upper non-dir
+ * entry - its pureness is tracked by flag opaque.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 4060ffde8722..05ac9a95e881 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -42,6 +42,19 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
int err;
struct dentry *upperdentry;
+ /*
+ * Check for permissions before trying to copy-up. This is redundant
+ * since it will be rechecked later by ->setattr() on upper dentry. But
+ * without this, copy-up can be triggered by just about anybody.
+ *
+ * We don't initialize inode->size, which just means that
+ * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
+ * check for a swapfile (which this won't be anyway).
+ */
+ err = inode_change_ok(dentry->d_inode, attr);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
goto out;
@@ -52,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 70e9af551600..adcb1398c481 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -571,7 +571,8 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
(int) PTR_ERR(dentry));
continue;
}
- ovl_cleanup(upper->d_inode, dentry);
+ if (dentry->d_inode)
+ ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e38ee0fed24a..a1acc6004a91 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/pagemap.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/mount.h>
@@ -75,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location. Its purity depends only on opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -273,6 +276,37 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode)
+{
+ struct dentry *real;
+
+ if (d_is_dir(dentry)) {
+ if (!inode || inode == d_inode(dentry))
+ return dentry;
+ goto bug;
+ }
+
+ real = ovl_dentry_upper(dentry);
+ if (real && (!inode || inode == d_inode(real)))
+ return real;
+
+ real = ovl_dentry_lower(dentry);
+ if (!real)
+ goto bug;
+
+ if (!inode || inode == d_inode(real))
+ return real;
+
+ /* Handle recursion */
+ if (real->d_flags & DCACHE_OP_REAL)
+ return real->d_op->d_real(real, inode);
+
+bug:
+ WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry,
+ inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+ return dentry;
+}
+
static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -317,10 +351,13 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
.d_select_inode = ovl_d_select_inode,
+ .d_real = ovl_d_real,
};
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
+ .d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
@@ -910,6 +947,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_stack_depth = 0;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
if (ufs->config.upperdir) {
if (!ufs->config.workdir) {
pr_err("overlayfs: missing 'workdir'\n");
@@ -1053,6 +1091,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
root_dentry->d_fsdata = oe;
+ ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode,
+ root_dentry->d_inode);
+
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
sb->s_root = root_dentry;
diff --git a/fs/pipe.c b/fs/pipe.c
index 42cf8ddf0e55..ab8dad3ccb6a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576;
*/
unsigned int pipe_min_size = PAGE_SIZE;
+/* Maximum allocatable pages per user. Hard limit is unset by default, soft
+ * matches default values.
+ */
+unsigned long pipe_user_pages_hard;
+unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+
/*
* We use a start+len construction, which provides full use of the
* allocated memory.
@@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
+static void account_pipe_buffers(struct pipe_inode_info *pipe,
+ unsigned long old, unsigned long new)
+{
+ atomic_long_add(new - old, &pipe->user->pipe_bufs);
+}
+
+static bool too_many_pipe_buffers_soft(struct user_struct *user)
+{
+ return pipe_user_pages_soft &&
+ atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+}
+
+static bool too_many_pipe_buffers_hard(struct user_struct *user)
+{
+ return pipe_user_pages_hard &&
+ atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+}
+
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
if (pipe) {
- pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+ unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+ struct user_struct *user = get_current_user();
+
+ if (!too_many_pipe_buffers_hard(user)) {
+ if (too_many_pipe_buffers_soft(user))
+ pipe_bufs = 1;
+ pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+ }
+
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = PIPE_DEF_BUFFERS;
+ pipe->buffers = pipe_bufs;
+ pipe->user = user;
+ account_pipe_buffers(pipe, 0, pipe_bufs);
mutex_init(&pipe->mutex);
return pipe;
}
+ free_uid(user);
kfree(pipe);
}
@@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
+ account_pipe_buffers(pipe, pipe->buffers, 0);
+ free_uid(pipe->user);
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
@@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}
+ account_pipe_buffers(pipe, pipe->buffers, nr_pages);
pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
@@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
ret = -EPERM;
goto out;
+ } else if ((too_many_pipe_buffers_hard(pipe->user) ||
+ too_many_pipe_buffers_soft(pipe->user)) &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
}
ret = pipe_set_size(pipe, nr_pages);
break;
diff --git a/fs/pnode.c b/fs/pnode.c
index 6367e1e435c6..99899705b105 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -198,10 +198,15 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
/* all accesses are serialized by namespace_sem */
static struct user_namespace *user_ns;
-static struct mount *last_dest, *last_source, *dest_master;
+static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+ return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
static int propagate_one(struct mount *m)
{
struct mount *child;
@@ -212,24 +217,26 @@ static int propagate_one(struct mount *m)
/* skip if mountpoint isn't covered by it */
if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
return 0;
- if (m->mnt_group_id == last_dest->mnt_group_id) {
+ if (peers(m, last_dest)) {
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
+ bool done;
for (n = m; ; n = p) {
p = n->mnt_master;
- if (p == dest_master || IS_MNT_MARKED(p)) {
- while (last_dest->mnt_master != p) {
- last_source = last_source->mnt_master;
- last_dest = last_source->mnt_parent;
- }
- if (n->mnt_group_id != last_dest->mnt_group_id) {
- last_source = last_source->mnt_master;
- last_dest = last_source->mnt_parent;
- }
+ if (p == dest_master || IS_MNT_MARKED(p))
break;
- }
}
+ do {
+ struct mount *parent = last_source->mnt_parent;
+ if (last_source == first_source)
+ break;
+ done = parent->mnt_master == p;
+ if (done && peers(n, parent))
+ break;
+ last_source = last_source->mnt_master;
+ } while (!done);
+
type = CL_SLAVE;
/* beginning of peer group among the slaves? */
if (IS_MNT_SHARED(m))
@@ -281,6 +288,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
*/
user_ns = current->nsproxy->mnt_ns->user_ns;
last_dest = dest_mnt;
+ first_source = source_mnt;
last_source = source_mnt;
mp = dest_mp;
list = tree_list;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d73291f5f0fc..b6c00ce0e29e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4bd5d3118acd..e8bbf6cdb437 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+ struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
do {
@@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
+ if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
+ && !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
seq_putc(m, '0');
@@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task)
int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
if (err)
return err;
- if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
mutex_unlock(&task->signal->cred_guard_mutex);
return -EPERM;
}
@@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
put_task_struct(task);
}
return allowed;
@@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
return true;
if (in_group_p(pid->pid_gid))
return true;
- return ptrace_may_access(task, PTRACE_MODE_READ);
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
@@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
struct mm_struct *mm = ERR_PTR(-ESRCH);
if (task) {
- mm = mm_access(task, mode);
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
put_task_struct(task);
if (!IS_ERR_OR_NULL(mm)) {
@@ -953,7 +954,8 @@ static ssize_t environ_read(struct file *file, char __user *buf,
int ret = 0;
struct mm_struct *mm = file->private_data;
- if (!mm)
+ /* Ensure the process spawned far enough to have an environment. */
+ if (!mm || !mm->env_end)
return 0;
page = (char *)__get_free_page(GFP_TEMPORARY);
@@ -1856,7 +1858,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (!task)
goto out_notask;
- mm = mm_access(task, PTRACE_MODE_READ);
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm))
goto out;
@@ -2007,7 +2009,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
goto out;
result = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
result = -ENOENT;
@@ -2060,7 +2062,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
goto out;
ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
ret = 0;
@@ -2530,7 +2532,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
if (result)
return result;
- if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
result = -EACCES;
goto out_unlock;
}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index f6e8354b8cea..1b0ea4a5d89e 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -42,7 +42,7 @@ static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
if (!task)
return error;
- if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
error = ns_get_path(&ns_path, task, ns_ops);
if (!error)
nd_jump_link(&ns_path);
@@ -63,7 +63,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!task)
return res;
- if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
res = readlink_copy(buffer, buflen, name);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187b3b5f242e..f6478301db00 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1435,6 +1435,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return page;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pmd_present(pmd))
+ return NULL;
+
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ return NULL;
+
+ if (PageReserved(page))
+ return NULL;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ return NULL;
+
+ return page;
+}
+#endif
+
static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -1444,13 +1470,13 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
- page = can_gather_numa_stats(huge_pte, vma, addr);
+ page = can_gather_numa_stats_pmd(*pmd, vma, addr);
if (page)
- gather_stats(page, md, pte_dirty(huge_pte),
+ gather_stats(page, md, pmd_dirty(*pmd),
HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(ptl);
return 0;
@@ -1458,6 +1484,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
+#endif
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
struct page *page = can_gather_numa_stats(*pte, vma, addr);
@@ -1473,18 +1500,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
+ pte_t huge_pte = huge_ptep_get(pte);
struct numa_maps *md;
struct page *page;
- if (!pte_present(*pte))
+ if (!pte_present(huge_pte))
return 0;
- page = pte_page(*pte);
+ page = pte_page(huge_pte);
if (!page)
return 0;
md = walk->private;
- gather_stats(page, md, pte_dirty(*pte), 1);
+ gather_stats(page, md, pte_dirty(huge_pte), 1);
return 0;
}
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8ebd9a334085..87645955990d 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -197,6 +197,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
if (sb->s_op->show_devname) {
seq_puts(m, "device ");
err = sb->s_op->show_devname(m, mnt_path.dentry);
+ if (err)
+ goto out;
} else {
if (r->mnt_devname) {
seq_puts(m, "device ");
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ef0d64b2a6d9..353ff31dcee1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1398,7 +1398,7 @@ static int dquot_active(const struct inode *inode)
static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot **dquots, *got[MAXQUOTAS];
+ struct dquot **dquots, *got[MAXQUOTAS] = {};
struct super_block *sb = inode->i_sb;
qsize_t rsv;
int ret = 0;
@@ -1415,7 +1415,6 @@ static int __dquot_initialize(struct inode *inode, int type)
int rc;
struct dquot *dquot;
- got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
/*
diff --git a/fs/splice.c b/fs/splice.c
index 4cf700d50b40..0f77e9682857 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int spd_pages = spd->nr_pages;
int ret, do_wakeup, page_nr;
+ if (!spd_pages)
+ return 0;
+
ret = 0;
do_wakeup = 0;
page_nr = 0;
diff --git a/fs/super.c b/fs/super.c
index 954aeb80e202..f5f4b328f860 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b94fa6c3c6eb..053818dd6c18 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
if (isalarm(ctx))
remaining = alarm_expires_remaining(&ctx->t.alarm);
else
- remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+ remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr);
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d0b3ade0ff0..566df9b5a6cb 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -2047,14 +2047,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos,
epos->offset += adsize;
}
+/*
+ * Only 1 indirect extent in a row really makes sense but allow upto 16 in case
+ * someone does some weird stuff.
+ */
+#define UDF_MAX_INDIR_EXTS 16
+
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int8_t etype;
+ unsigned int indirections = 0;
while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
(EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
int block;
+
+ if (++indirections > UDF_MAX_INDIR_EXTS) {
+ udf_err(inode->i_sb,
+ "too many indirect extents in inode %lu\n",
+ inode->i_ino);
+ return -1;
+ }
+
epos->block = *eloc;
epos->offset = sizeof(struct allocExtDesc);
brelse(epos->bh);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index ab478e62baae..e788a05aab83 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
if (c < 0x80U)
utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
else if (c < 0x800U) {
+ if (utf_o->u_len > (UDF_NAME_LEN - 4))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xc0 | (c >> 6));
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0x80 | (c & 0x3f));
} else {
+ if (utf_o->u_len > (UDF_NAME_LEN - 5))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xe0 | (c >> 12));
utf_o->u_name[utf_o->u_len++] =
@@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
{
unsigned c, i, max_val, utf_char;
- int utf_cnt, u_len;
+ int utf_cnt, u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
utf_char = 0U;
utf_cnt = 0U;
for (i = 0U; i < utf->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
+
c = (uint8_t)utf->u_name[i];
/* Complete a multi-byte UTF-8 character */
@@ -225,6 +234,7 @@ try_again:
if (max_val == 0xffU) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
goto error_out;
@@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
c = (c << 8) | ocu[i++];
len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - utf_o->u_len);
+ UDF_NAME_LEN - 2 - utf_o->u_len);
/* Valid character? */
if (len >= 0)
utf_o->u_len += len;
@@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
int len;
unsigned i, max_val;
uint16_t uni_char;
- int u_len;
+ int u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
for (i = 0U; i < uni->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
if (!len)
continue;
@@ -316,6 +330,7 @@ try_again:
if (uni_char > max_val) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 50311703135b..66cdb44616d5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
goto out;
/*
+ * We don't do userfault handling for the final child pid update.
+ */
+ if (current->flags & PF_EXITING)
+ goto out;
+
+ /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3479294c1d58..e1e7fe3b5424 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -535,6 +535,7 @@ xfs_agfl_write_verify(
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .name = "xfs_agfl",
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
};
@@ -2339,6 +2340,7 @@ xfs_agf_write_verify(
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .name = "xfs_agf",
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 90de071dd4c2..eb8bbfe85484 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -379,6 +379,7 @@ xfs_allocbt_write_verify(
}
const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .name = "xfs_allocbt",
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index aa187f7ba2dd..01a5ecfedfcf 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -328,6 +328,7 @@ xfs_attr3_leaf_read_verify(
}
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .name = "xfs_attr3_leaf",
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 5ab95ffa4ae9..f3ed9bf0b065 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify(
}
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .name = "xfs_attr3_rmt",
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6b0cf6546a82..1637c37bfbaa 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -720,6 +720,7 @@ xfs_bmbt_write_verify(
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .name = "xfs_bmbt",
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e89a0f8f827c..097bf7717d80 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -245,6 +245,7 @@ xfs_da3_node_read_verify(
}
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .name = "xfs_da3_node",
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9c10e2b8cfcb..aa17cb788946 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -123,6 +123,7 @@ xfs_dir3_block_write_verify(
}
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .name = "xfs_dir3_block",
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index af71a84f343c..725fc7841fde 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -305,11 +305,13 @@ xfs_dir3_data_write_verify(
}
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .name = "xfs_dir3_data",
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
};
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .name = "xfs_dir3_data_reada",
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 3923e1f94697..b887fb2a2bcf 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -245,11 +245,13 @@ xfs_dir3_leafn_write_verify(
}
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .name = "xfs_dir3_leaf1",
.verify_read = xfs_dir3_leaf1_read_verify,
.verify_write = xfs_dir3_leaf1_write_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .name = "xfs_dir3_leafn",
.verify_read = xfs_dir3_leafn_read_verify,
.verify_write = xfs_dir3_leafn_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 70b0cb2fd556..63ee03db796c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -150,6 +150,7 @@ xfs_dir3_free_write_verify(
}
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .name = "xfs_dir3_free",
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 5331b7f0460c..3cc3cf767474 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -54,7 +54,7 @@ xfs_dqcheck(
xfs_dqid_t id,
uint type, /* used only when IO_dorepair is true */
uint flags,
- char *str)
+ const char *str)
{
xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
int errs = 0;
@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
STATIC bool
xfs_dquot_buf_verify(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ int warn)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
xfs_dqid_t id = 0;
@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_buf_verify");
+ error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
if (error)
return false;
}
@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
if (!xfs_dquot_buf_verify_crc(mp, bp))
xfs_buf_ioerror(bp, -EFSBADCRC);
- else if (!xfs_dquot_buf_verify(mp, bp))
+ else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
}
/*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp) ||
+ !xfs_dquot_buf_verify(mp, bp, 0)) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ }
+}
+
+/*
* we don't calculate the CRC here as that is done when the dquot is flushed to
* the buffer after the update is done. This ensures that the dquot in the
* buffer always has an up-to-date CRC value.
@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if (!xfs_dquot_buf_verify(mp, bp)) {
+ if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
@@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify(
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .name = "xfs_dquot",
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+ .name = "xfs_dquot_ra",
+ .verify_read = xfs_dquot_buf_readahead_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8774498ce0ff..e2536bb1c760 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -786,7 +786,7 @@ typedef struct xfs_agfl {
__be64 agfl_lsn;
__be32 agfl_crc;
__be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+} __attribute__((packed)) xfs_agfl_t;
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 70c1db99f6a7..66d702e6b9ff 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2572,6 +2572,7 @@ xfs_agi_write_verify(
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .name = "xfs_agi",
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index f39b285beb19..6dd44f9ea727 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -304,6 +304,7 @@ xfs_inobt_write_verify(
}
const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .name = "xfs_inobt",
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 268c00f4f83a..1aabfda669b0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -62,11 +62,14 @@ xfs_inobp_check(
* has not had the inode cores stamped into it. Hence for readahead, the buffer
* may be potentially invalid.
*
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get an unnecessary
- * warnings during log recovery and we don't get unnecssary panics on debug
- * kernels.
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise to avoid warnings during log
+ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behavour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
*/
static void
xfs_inode_buf_verify(
@@ -93,6 +96,7 @@ xfs_inode_buf_verify(
XFS_RANDOM_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
return;
}
@@ -132,11 +136,13 @@ xfs_inode_buf_write_verify(
}
const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .name = "xfs_inode",
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .name = "xxfs_inode_ra",
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1b0a08379759..f51078f1e92a 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
- xfs_dqid_t id, uint type, uint flags, char *str);
+ xfs_dqid_t id, uint type, uint flags, const char *str);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a0b071d881a0..8a53eaa349f4 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -679,11 +679,13 @@ xfs_sb_write_verify(
}
const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .name = "xfs_sb",
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .name = "xfs_sb_quiet",
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 5be529707903..15c3ceb845b9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index cb6fd20a4d3d..2e2c6716b623 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -168,6 +168,7 @@ xfs_symlink_write_verify(
}
const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .name = "xfs_symlink",
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
};
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 0ef7c2ed3f8a..4fa14820e2e2 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sbp->namelen,
sbp->valuelen,
&sbp->name[sbp->namelen]);
- if (error)
+ if (error) {
+ kmem_free(sbuf);
return error;
+ }
if (context->seen_enough)
break;
cursor->offset++;
@@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int(
args.rmtblkcnt = xfs_attr3_rmt_blocks(
args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args);
- if (retval)
- return retval;
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- args.value);
+ if (!retval)
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ args.value);
kmem_free(args.value);
} else {
retval = context->put_listent(context,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 3243cdf97f33..39090fc56f09 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -604,6 +604,13 @@ found:
}
}
+ /*
+ * Clear b_error if this is a lookup from a caller that doesn't expect
+ * valid data to be found in the buffer.
+ */
+ if (!(flags & XBF_READ))
+ xfs_buf_ioerror(bp, 0);
+
XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
@@ -1520,6 +1527,16 @@ xfs_wait_buftarg(
LIST_HEAD(dispose);
int loop = 0;
+ /*
+ * We need to flush the buffer workqueue to ensure that all IO
+ * completion processing is 100% done. Just waiting on buffer locks is
+ * not sufficient for async IO as the reference count held over IO is
+ * not released until after the buffer lock is dropped. Hence we need to
+ * ensure here that all reference counts have been dropped before we
+ * start walking the LRU list.
+ */
+ drain_workqueue(btp->bt_mount->m_buf_workqueue);
+
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c79b717d9b88..c75721acd867 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -132,6 +132,7 @@ struct xfs_buf_map {
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
struct xfs_buf_ops {
+ char *name;
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
};
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 74d0e5966ebc..88693a98fac5 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -164,9 +164,9 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- __return_address, bp->b_bn);
+ __return_address, bp->b_ops->name, bp->b_bn);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ee3aaa0a5317..ca0d3eb44925 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -243,8 +243,8 @@ xfs_growfs_data_private(
agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- agf->agf_flfirst = 0;
- agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
agf->agf_flcount = 0;
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8ee393996b7d..f0ce28cd311d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3220,13 +3220,14 @@ xfs_iflush_cluster(
* We need to check under the i_flags_lock for a valid inode
* here. Skip it if it is not valid or the wrong inode.
*/
- spin_lock(&ip->i_flags_lock);
- if (!ip->i_ino ||
+ spin_lock(&iq->i_flags_lock);
+ if (!iq->i_ino ||
+ __xfs_iflags_test(iq, XFS_ISTALE) ||
(XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
- spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&iq->i_flags_lock);
continue;
}
- spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&iq->i_flags_lock);
/*
* Do an un-protected check to see if the inode is dirty and
@@ -3342,7 +3343,7 @@ xfs_iflush(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp;
+ struct xfs_buf *bp = NULL;
struct xfs_dinode *dip;
int error;
@@ -3384,14 +3385,22 @@ xfs_iflush(
}
/*
- * Get the buffer containing the on-disk inode.
+ * Get the buffer containing the on-disk inode. We are doing a try-lock
+ * operation here, so we may get an EAGAIN error. In that case, we
+ * simply want to return with the inode still dirty.
+ *
+ * If we get any other error, we effectively have a corruption situation
+ * and we cannot flush the inode, so we treat it the same as failing
+ * xfs_iflush_int().
*/
error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
0);
- if (error || !bp) {
+ if (error == -EAGAIN) {
xfs_ifunlock(ip);
return error;
}
+ if (error)
+ goto corrupt_out;
/*
* First flush out the inode that xfs_iflush was called with.
@@ -3419,7 +3428,8 @@ xfs_iflush(
return 0;
corrupt_out:
- xfs_buf_relse(bp);
+ if (bp)
+ xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
error = -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c5ecaacdd218..5991cdcb9040 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3204,6 +3204,7 @@ xlog_recover_dquot_ra_pass2(
struct xfs_disk_dquot *recddq;
struct xfs_dq_logformat *dq_f;
uint type;
+ int len;
if (mp->m_qflags == 0)
@@ -3224,8 +3225,12 @@ xlog_recover_dquot_ra_pass2(
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+ len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
+ if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
+ &xfs_dquot_buf_ra_ops);
}
STATIC void
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 36bd8825bfb0..ef64a1e1a66a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1233,6 +1233,16 @@ xfs_fs_remount(
return -EINVAL;
}
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_ro_compat_feature(sbp,
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EINVAL;
+ }
+
mp->m_flags &= ~XFS_MOUNT_RDONLY;
/*
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index aa67339b9537..4f18fd92ca13 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,7 +497,6 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
- set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)