From 90f8572b0f021fdd1baa68e00a8c30482ee9e5f4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 29 Jun 2015 14:42:03 -0500 Subject: vfs: Commit to never having exectuables on proc and sysfs. Today proc and sysfs do not contain any executable files. Several applications today mount proc or sysfs without noexec and nosuid and then depend on there being no executable files on proc or sysfs. Having any executable files show up on proc or sysfs would cause a user space visible regression, and most likely security problems. Therefore commit to never allowing executables on proc and sysfs by adding a new flag to mark them as filesystems without executables and enforce that flag. Test the flag where MNT_NOEXEC is tested today, so that the only user visible effect will be that executables will be treated as if the execute bit is cleared. The filesystems proc and sysfs do not currently incorporate any executable files so this does not result in any user visible effects. This makes it unnecessary to vet changes to proc and sysfs tightly for adding executable files or changes to chattr that would modify existing files, as no matter what the individual files say they will not be treated as executable files by the vfs. Not having to vet changes too closely is important as without this we are only one proc_create call (or another goof up in the implementation of notify_change) from having problematic executables on proc. Those mistakes are all too easy to make and would create a situation where there are security issues or the assumptions of some program having to be broken (and cause userspace regressions). Signed-off-by: "Eric W.
Biederman" --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0653e560c26..42912f8d286e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1244,6 +1244,7 @@ struct mm_struct; /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ +#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ /* Possible states of 'frozen' field */ enum { @@ -3030,4 +3031,6 @@ static inline bool dir_relax(struct inode *inode) return !IS_DEADDIR(inode); } +extern bool path_noexec(const struct path *path); + #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From c87fb4a378f93f114b9906e180d83877cee4e7f4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Aug 2015 12:47:02 -0400 Subject: lockd: NLM grace period shouldn't block NFSv4 opens NLM locks don't conflict with NFSv4 share reservations, so we're not going to learn anything new by waiting for them. They do conflict with NFSv4 locks and with delegations. Signed-off-by: J.
Bruce Fields --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index cc008c338f5a..9a9d314f7b27 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -942,12 +942,18 @@ struct lock_manager_operations { struct lock_manager { struct list_head list; + /* + * NFSv4 and up also want opens blocked during the grace period; + * NLM doesn't care: + */ + bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); int locks_in_grace(struct net *); +int opens_in_grace(struct net *); /* that will die - we need it for nfs_lock_info */ #include -- cgit v1.2.3 From bee9182d955227f01ff3b80c4cb6acca9bb40b11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Jul 2015 23:48:20 +0200 Subject: introduce __sb_writers_{acquired,release}() helpers Preparation to hide the sb->s_writers internals from xfs and btrfs. Add 2 trivial define's they can use rather than play with ->s_writers directly. No changes in btrfs/transaction.o and xfs/xfs_aops.o. 
Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..acb7cad84edd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1391,6 +1391,11 @@ extern struct timespec current_fs_time(struct super_block *sb); void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); +#define __sb_writers_acquired(sb, lev) \ + rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to -- cgit v1.2.3 From 853b39a7c82826b8413048feec7bf08e98ce7a84 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jul 2015 20:21:13 +0200 Subject: shift percpu_counter_destroy() into destroy_super_work() Of course, this patch is ugly as hell. It will be (partially) reverted later. We add it to ensure that other WIP changes in percpu_rw_semaphore won't break fs/super.c. We do not even need this change right now, percpu_free_rwsem() is fine in atomic context. But we are going to change this, it will be might_sleep() after we merge the rcu_sync() patches. And even after that we do not really need destroy_super_work(), we will kill it in any case. Instead, destroy_super_rcu() should just check that rss->cb_state == CB_IDLE and do call_rcu() again in the (very unlikely) case this is not true. So this is just the temporary kludge which helps us to avoid the conflicts with the changes which will be (hopefully) routed via rcu tree. 
Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index acb7cad84edd..4bed78966c6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1375,7 +1376,7 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; - + struct work_struct destroy_work; /* * Indicates how deep in a filesystem stack this SB is */ -- cgit v1.2.3 From 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 17:05:04 +0200 Subject: change sb_writers to use percpu_rw_semaphore We can remove everything from struct sb_writers except frozen and add the array of percpu_rw_semaphore's instead. This patch doesn't remove sb_writers->wait_unfrozen yet, we keep it for get_super_thawed(). We will probably remove it later. This change tries to address the following problems: - Firstly, __sb_start_write() looks simply buggy. It does __sb_end_write() if it sees ->frozen, but if it migrates to another CPU before percpu_counter_dec(), sb_wait_write() can wrongly succeed if there is another task which holds the same "semaphore": sb_wait_write() can miss the result of the previous percpu_counter_inc() but see the result of this percpu_counter_dec(). - As Dave Hansen reports, it is suboptimal. The trivial microbenchmark that writes to a tmpfs file in a loop runs 12% faster if we change this code to rely on RCU and kill the memory barriers. - This code doesn't look simple. It would be better to rely on the generic locking code. According to Dave, this change adds the same performance improvement. Note: with this change both freeze_super() and thaw_super() will do synchronize_sched_expedited() 3 times. 
This is just ugly. But: - This will be "fixed" by the rcu_sync changes we are going to merge. After that freeze_super()->percpu_down_write() will use synchronize_sched(), and thaw_super() won't use synchronize() at all. This doesn't need any changes in fs/super.c. - Once we merge rcu_sync changes, we can also change super.c so that all sb_writers->rw_sem's will share the single ->rss in struct sb_writers, then freeze_super() will need only one synchronize_sched(). Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- include/linux/fs.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4bed78966c6b..ce356f66cc2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include @@ -1275,16 +1275,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - /* Counters for counting writers at each level */ - struct percpu_counter counter[SB_FREEZE_LEVELS]; - wait_queue_head_t wait; /* queue for waiting for - writers / faults to finish */ - int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* queue for waiting for - sb to be thawed */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; -#endif + int frozen; /* Is sb frozen?
*/ + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { @@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); #define __sb_writers_acquired(sb, lev) \ - rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ - rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) /** * sb_end_write - drop write access to a superblock -- cgit v1.2.3 From cbedaac63481dea52327127a9f1c60f092bd6b07 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Mar 2015 08:19:11 -0400 Subject: inode: add hlist_fake to avoid the inode hash lock in evict Some filesystems don't use the VFS inode hash and fake the fact they are hashed so that all the writeback code works correctly. However, this means the evict() path still tries to remove the inode from the hash, meaning that the inode_hash_lock() needs to be taken unnecessarily. Hence under certain workloads the inode_hash_lock can be contended even if the inode is never actually hashed. To avoid this add hlist_fake to test if the inode isn't actually hashed to avoid taking the hash lock on inodes that have never been hashed. Based on Dave Chinner's inode: add IOP_NOTHASHED to avoid inode hash lock in evict based on Al's suggestions.
Thanks, Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..4a40fa843040 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2608,7 +2608,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { - if (!inode_unhashed(inode)) + if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } -- cgit v1.2.3 From 74278da9f70d84d715601fe794567a6d2bfdf078 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 12:37:22 -0500 Subject: inode: convert inode_sb_list_lock to per-sb The process of reducing contention on per-superblock inode lists starts with moving the locking to match the per-superblock inode list. This takes the global lock out of the picture and reduces the contention problems to within a single filesystem. This doesn't get rid of contention as the locks still have global CPU scope, but it does isolate operations on different superblocks from each other.
Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4a40fa843040..09bbd38485f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1309,7 +1309,6 @@ struct super_block { #endif const struct xattr_handler **s_xattr; - struct list_head s_inodes; /* all inodes */ struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; @@ -1380,6 +1379,10 @@ struct super_block { * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ }; extern struct timespec current_fs_time(struct super_block *sb); -- cgit v1.2.3 From e97fedb9ef9868ff24d588be781906cf7c1b59ae Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 13:40:00 -0500 Subject: sync: serialise per-superblock sync operations When competing sync(2) calls walk the same filesystem, they need to walk the list of inodes on the superblock to find all the inodes that we need to wait for IO completion on. However, when multiple wait_sb_inodes() calls do this at the same time, they contend on the inode_sb_list_lock and the contention causes system wide slowdowns. In effect, concurrent sync(2) calls can take longer and burn more CPU than if they were serialised. Stop the worst of the contention by adding a per-sb mutex to wrap around wait_sb_inodes() so that we only execute one sync(2) IO completion walk per superblock at a time and hence avoid contention being triggered by concurrent sync(2) calls.
Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 09bbd38485f9..82dfc5519b4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1375,6 +1375,8 @@ struct super_block { struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + struct mutex s_sync_lock; /* sync serialisation lock */ + /* * Indicates how deep in a filesystem stack this SB is */ -- cgit v1.2.3 From c7f5408493aeb01532927b2276316797a03ed6ee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 14:07:22 -0500 Subject: inode: rename i_wb_list to i_io_list There's a small consistency problem between the inode and writeback naming. Writeback calls the "for IO" inode queues b_io and b_more_io, but the inode calls these the "writeback list" or i_wb_list. This makes it hard to add a new "under writeback" list to the inode, or call it an "under IO" list on the bdi because either way we'll have writeback on IO and IO on writeback and it'll just be confusing. I'm getting confused just writing this! So, rename the inode "for IO" list variable to i_io_list so we can add a new "writeback list" in a subsequent patch.
Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 82dfc5519b4b..34cfa60db678 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -636,7 +636,7 @@ struct inode { unsigned long dirtied_time_when; struct hlist_node i_hash; - struct list_head i_wb_list; /* backing dev IO list */ + struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ -- cgit v1.2.3 From 5477e70a6420a6b7ca96c8e21413ee1c96a84260 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:04 -0700 Subject: mm: move ->mremap() from file_operations to vm_operations_struct vma->vm_ops->mremap() looks more natural and clean in move_vma(), and this way ->mremap() can have more users. Say, vdso. While at it, s/aio_ring_remap/aio_ring_mremap/. Note: this is the minimal change before ->mremap() finds another user in file_operations; this method should have more arguments, and it can be used to kill arch_remap(). Signed-off-by: Oleg Nesterov Acked-by: Pavel Emelyanov Acked-by: Kirill A. 
Shutemov Cc: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index fbd780c33c5f..864203c10dbc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1612,7 +1612,6 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - int (*mremap)(struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); -- cgit v1.2.3 From c94c2acf84dc16cf4b989bb0bc849785b7ff52f5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:40 -0700 Subject: dax: move DAX-related functions to a new header In order to handle the !CONFIG_TRANSPARENT_HUGEPAGES case, we need to return VM_FAULT_FALLBACK from the inlined dax_pmd_fault(), which is defined in linux/mm.h. Given that we don't want to include <linux/mm.h> in <linux/fs.h>, the easiest solution is to move the DAX-related functions to a new header, <linux/dax.h>. We could also have moved VM_FAULT_* definitions to a new header, or a different header that isn't quite such a boil-the-ocean header as <linux/mm.h>, but this felt like the best option. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A.
Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b2f9b9c25e41..72d8a844c692 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,7 +52,6 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct vm_fault; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2678,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, - get_block_t, dio_iodone_t, int flags); -int dax_clear_blocks(struct inode *, sector_t block, long size); -int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); -int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); -int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) -#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) - #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); -- cgit v1.2.3