From 9e1f1de02c2275d7172e18dc4e7c2065777611bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 18:24:58 -0400 Subject: more conservative S_NOSEC handling Caching "we have already removed suid/caps" was overenthusiastic as merged. On network filesystems we might have had suid/caps set on another client, silently picked by this client on revalidate, all of that *without* clearing the S_NOSEC flag. AFAICS, the only reasonably sane way to deal with that is * new superblock flag; unless set, S_NOSEC is not going to be set. * local block filesystems set it in their ->mount() (more accurately, mount_bdev() does, so does btrfs ->mount(), users of mount_bdev() other than local block ones clear it) * if any network filesystem (or a cluster one) wants to use S_NOSEC, it'll need to set MS_NOSEC in sb->s_flags *AND* take care to clear S_NOSEC when inode attribute changes are picked from other clients. It's not an earth-shattering hole (anybody that can set suid on another client will almost certainly be able to write to the file before doing that anyway), but it's a bug that needs fixing. Signed-off-by: Al Viro --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index c55d6b7cd5d6..646a1836152a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -2591,7 +2592,7 @@ static inline int is_sxid(mode_t mode) static inline void inode_has_no_xattr(struct inode *inode) { - if (!is_sxid(inode->i_mode)) + if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; } -- cgit v1.2.3 From 13e12d14e2dccc7995b8f15a5678a338ab4e6a8c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jun 2011 15:18:19 -0700 Subject: vfs: reorganize 'struct inode' layout a bit This tries to make the 'struct inode' accesses denser in the data cache by moving a commonly accessed field (i_security) closer to other fields that are accessed often. It also makes 'i_state' just an 'unsigned int' rather than 'unsigned long', since we only use a few bits of that field, and moves it next to the existing 'i_flags' so that we potentially get better structure layout (although depending on config options, i_flags may already have packed in the same word as i_lock, so this improves packing only for the case of spinlock debugging) Out 'struct inode' is still way too big, and we should probably move some other fields around too (the acl fields in particular) for better data cache access density. Other fields (like the inode hash) are likely to be entirely irrelevant under most loads. Signed-off-by: Linus Torvalds --- include/linux/fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 646a1836152a..1c777878f1ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -744,9 +744,13 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned int i_flags; + unsigned int i_state; +#ifdef CONFIG_SECURITY + void *i_security; +#endif struct mutex i_mutex; - unsigned long i_state; + unsigned long dirtied_when; /* jiffies of first dirtying */ struct hlist_node i_hash; @@ -798,9 +802,6 @@ struct inode { atomic_t i_readcount; /* struct files open RO */ #endif atomic_t i_writecount; -#ifdef CONFIG_SECURITY - void *i_security; -#endif #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; -- cgit v1.2.3 From e44ba033c5654dbfda53461c9b1f7dd9bd1d198f Mon Sep 17 00:00:00 2001 From: Vitaliy Ivanov Date: Mon, 20 Jun 2011 16:08:07 +0200 Subject: treewide: remove duplicate includes Many stupid corrections of duplicated includes based on the output of scripts/checkincludes.pl. Signed-off-by: Vitaliy Ivanov Signed-off-by: Jiri Kosina --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1c777878f1ea..450ca245df84 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -376,7 +376,6 @@ struct inodes_stat_t { #include #include -#include #include #include #include -- cgit v1.2.3 From 79568f5be06c91071697c065f01f3ebfbeb25a61 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Jun 2011 20:13:49 -0700 Subject: vfs: i_state needs to be 'unsigned long' for now Commit 13e12d14e2dc ("vfs: reorganize 'struct inode' layout a bit") moved things around a bit changed i_state to be unsigned int instead of unsigned long. That was to help structure layout for the 64-bit case, and shrink 'struct inode' a bit (admittedly that only happened when spinlock debugging was on and i_flags didn't pack with i_lock). However, Meelis Roos reports that this results in unaligned exceptions on sprc, and it turns out that the bit-locking primitives that we use for the I_NEW bit want to use the bitops. Which want 'unsigned long', not 'unsigned int'. We really should fix the bit locking code to not have that kind of requirement, but that's a much bigger change. So for now, revert that field back to 'unsigned long' (but keep the other re-ordering changes from the commit that caused this). Andi points out that we have played games with this in 'struct page', so it's solvable with other hacks too, but since right now the struct inode size advantage only happens with some rare config options, it's not worth fighting. It _would_ be worth fixing the bitlocking code, though. Especially since there is no type safety in the bitlocking code (this never caused any warnings, and worked fine on x86-64, because the bitlocks take a 'void *' and x86-64 doesn't care that deeply about alignment). So it's currently a very easy problem to trigger by mistake and never notice. Reported-by: Meelis Roos Cc: Andi Kleen Cc: David Miller Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1c777878f1ea..6e73e2e9ae33 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -744,7 +744,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned int i_flags; - unsigned int i_state; + unsigned long i_state; #ifdef CONFIG_SECURITY void *i_security; #endif -- cgit v1.2.3 From 08142579b6ca35883c1ed066a2681de6f6917062 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Jun 2011 16:18:10 -0700 Subject: mm: fix assertion mapping->nrpages == 0 in end_writeback() Under heavy memory and filesystem load, users observe the assertion mapping->nrpages == 0 in end_writeback() trigger. This can be caused by page reclaim reclaiming the last page from a mapping in the following race: CPU0 CPU1 ... shrink_page_list() __remove_mapping() __delete_from_page_cache() radix_tree_delete() evict_inode() truncate_inode_pages() truncate_inode_pages_range() pagevec_lookup() - finds nothing end_writeback() mapping->nrpages != 0 -> BUG page->mapping = NULL mapping->nrpages-- Fix the problem by doing a reliable check of mapping->nrpages under mapping->tree_lock in end_writeback(). Analyzed by Jay , lost in LKML, and dug out by Miklos Szeredi . Cc: Jay Cc: Miklos Szeredi Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e73e2e9ae33..b5b979247863 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -639,6 +639,7 @@ struct address_space { struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ + /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ -- cgit v1.2.3 From 4aede84b33d6beb401136a3deca0651ae07c5e99 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 12 Jul 2011 08:31:45 +0200 Subject: fixlet: Remove fs_excl from struct task. fs_excl is a poor man's priority inheritance for filesystems to hint to the block layer that an operation is important. It was never clearly specified, not widely adopted, and will not prevent starvation in many cases (like across cgroups). fs_excl was introduced with the time sliced CFQ IO scheduler, to indicate when a process held FS exclusive resources and thus needed a boost. It doesn't cover all file systems, and it was never fully complete. Lets kill it. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- include/linux/fs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e73e2e9ae33..f6c866c287b5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1453,10 +1453,6 @@ enum { #define vfs_check_frozen(sb, level) \ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) -#define get_fs_excl() atomic_inc(¤t->fs_excl) -#define put_fs_excl() atomic_dec(¤t->fs_excl) -#define has_fs_excl() atomic_read(¤t->fs_excl) - /* * until VFS tracks user namespaces for inodes, just make all files * belong to init_user_ns -- cgit v1.2.3 From 43e15cdbefea4ce6d68113de98d4f61c0cf45687 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 20:16:57 -0400 Subject: new helper: iterate_supers_type() Call the given function for all superblocks of given type. Function gets a superblock (with s_umount locked shared) and (void *) argument supplied by caller of iterator. Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b5b979247863..a8735e7e1b35 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2432,6 +2432,8 @@ extern struct super_block *get_active_super(struct block_device *bdev); extern struct super_block *user_get_super(dev_t); extern void drop_super(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); +extern void iterate_supers_type(struct file_system_type *, + void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); -- cgit v1.2.3 From 3bfa784a6539f91a27d7ffdd408efdb638e3bebd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 12:55:10 -0400 Subject: kill file_permission() completely convert the last remaining caller to inode_permission() Signed-off-by: Al Viro --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a8735e7e1b35..d04e55586a17 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,6 @@ extern void dentry_unhash(struct dentry *dentry); /* * VFS file helper functions. */ -extern int file_permission(struct file *, int); extern void inode_init_owner(struct inode *inode, const struct inode *dir, mode_t mode); /* -- cgit v1.2.3 From 07b8ce1ee87d291ff564c02cf878fae973317a52 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 10:52:57 -0400 Subject: lockless get_write_access/deny_write_access new helpers: atomic_inc_unless_negative()/atomic_dec_unless_positive() Signed-off-by: Al Viro --- include/linux/fs.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index d04e55586a17..8c84ed930389 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -392,8 +392,8 @@ struct inodes_stat_t { #include #include #include +#include -#include #include struct export_operations; @@ -2195,8 +2195,31 @@ static inline bool execute_ok(struct inode *inode) return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } -extern int get_write_access(struct inode *); -extern int deny_write_access(struct file *); +/* + * get_write_access() gets write permission for a file. + * put_write_access() releases this write permission. + * This is used for regular files. + * We cannot support write (and maybe mmap read-write shared) accesses and + * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode + * can have the following values: + * 0: no writers, no VM_DENYWRITE mappings + * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist + * > 0: (i_writecount) users are writing to the file. + * + * Normally we operate on that counter with atomic_{inc,dec} and it's safe + * except for the cases where we don't hold i_writecount yet. Then we need to + * use {get,deny}_write_access() - these functions check the sign and refuse + * to do the change if sign is wrong. + */ +static inline int get_write_access(struct inode *inode) +{ + return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; +} +static inline int deny_write_access(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; +} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); -- cgit v1.2.3 From 178ea73521d64ba41d7aa5488fb9f549c6d4507d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 11:31:30 -0400 Subject: kill check_acl callback of generic_permission() its value depends only on inode and does not change; we might as well store it in ->i_op->check_acl and be done with that. Signed-off-by: Al Viro --- include/linux/fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8c84ed930389..0c15d5e459d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2187,8 +2187,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, unsigned int, - int (*check_acl)(struct inode *, int, unsigned int)); +extern int generic_permission(struct inode *, int, unsigned int); static inline bool execute_ok(struct inode *inode) { -- cgit v1.2.3 From 1fc0f78ca9f311c6277e2f1b7655bb4d43ceb311 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 18:59:02 -0400 Subject: ->permission() sanitizing: MAY_NOT_BLOCK Duplicate the flags argument into mask bitmap. Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0c15d5e459d5..60c1fe65bb2d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -63,6 +63,7 @@ struct inodes_stat_t { #define MAY_ACCESS 16 #define MAY_OPEN 32 #define MAY_CHDIR 64 +#define MAY_NOT_BLOCK 128 /* called from RCU mode, don't block */ /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond -- cgit v1.2.3 From 7e40145eb111a5192e6d819f764db9d6828d1abb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:12:17 -0400 Subject: ->permission() sanitizing: don't pass flags to ->check_acl() not used in the instances anymore. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c1fe65bb2d..f218b42718aa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1579,7 +1579,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); int (*permission) (struct inode *, int, unsigned int); - int (*check_acl)(struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); -- cgit v1.2.3 From 2830ba7f34ebb27c4e5b8b6ef408cd6d74860890 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:16:29 -0400 Subject: ->permission() sanitizing: don't pass flags to generic_permission() redundant; all callers get it duplicated in mask & MAY_NOT_BLOCK and none of them removes that bit. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f218b42718aa..a1689c13eb77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2188,7 +2188,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, unsigned int); +extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) { -- cgit v1.2.3 From 10556cb21a0d0b24d95f00ea6df16f599a3345b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:28:19 -0400 Subject: ->permission() sanitizing: don't pass flags to ->permission() not used by the instances anymore. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a1689c13eb77..1fdefe3995fd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1578,7 +1578,7 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int, unsigned int); + int (*permission) (struct inode *, int); int (*check_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); -- cgit v1.2.3 From 729cdb3a1ee03a4363f9c7e66ddd979727e99e1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Jun 2011 01:01:22 -0400 Subject: kill IPERM_FLAG_RCU not used anymore Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1fdefe3995fd..8494aac189f0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1573,8 +1573,6 @@ struct file_operations { loff_t len); }; -#define IPERM_FLAG_RCU 0x0001 - struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); -- cgit v1.2.3 From 0ee5dc676a5f8fadede608c7281dfedb1ae714ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jul 2011 15:44:25 -0400 Subject: btrfs: kill magical embedded struct superblock Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8494aac189f0..a0011aef4338 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1835,6 +1835,8 @@ void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); +int get_anon_bdev(dev_t *); +void free_anon_bdev(dev_t); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), -- cgit v1.2.3 From 98b745c647a5a90c3c21ea43cbfad9a47b0dfad7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:39 +1000 Subject: inode: Make unused inode LRU per superblock The inode unused list is currently a global LRU. This does not match the other global filesystem cache - the dentry cache - which uses per-superblock LRU lists. Hence we have related filesystem object types using different LRU reclaimation schemes. To enable a per-superblock filesystem cache shrinker, both of these caches need to have per-sb unused object LRU lists. Hence this patch converts the global inode LRU to per-sb LRUs. The patch only does rudimentary per-sb propotioning in the shrinker infrastructure, as this gets removed when the per-sb shrinker callouts are introduced later on. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0011aef4338..9724f0a48742 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1397,6 +1397,10 @@ struct super_block { struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ + /* inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + struct list_head s_inode_lru; /* unused inode lru */ + int s_nr_inodes_unused; /* # of inodes on lru */ + struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; -- cgit v1.2.3 From 09cc9fc7a7c3d872065426d7fb0f0ad6d3eb90fc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:40 +1000 Subject: inode: move to per-sb LRU locks With the inode LRUs moving to per-sb structures, there is no longer a need for a global inode_lru_lock. The locking can be made more fine-grained by moving to a per-sb LRU lock, isolating the LRU operations of different filesytsems completely from each other. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9724f0a48742..460d2cc21ec6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1397,7 +1397,8 @@ struct super_block { struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ - /* inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; struct list_head s_inode_lru; /* unused inode lru */ int s_nr_inodes_unused; /* # of inodes on lru */ -- cgit v1.2.3 From 8fb47a4fbf858a164e973b8ea8ef5e83e61f2e50 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Jul 2011 20:21:59 -0400 Subject: locks: rename lock-manager ops Both the filesystem and the lock manager can associate operations with a lock. Confusingly, one of them (fl_release_private) actually has the same name in both operation structures. It would save some confusion to give the lock-manager ops different names. Signed-off-by: J. Bruce Fields --- include/linux/fs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b5b979247863..cf719beb2016 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1069,12 +1069,12 @@ struct file_lock_operations { }; struct lock_manager_operations { - int (*fl_compare_owner)(struct file_lock *, struct file_lock *); - void (*fl_notify)(struct file_lock *); /* unblock callback */ - int (*fl_grant)(struct file_lock *, struct file_lock *, int); - void (*fl_release_private)(struct file_lock *); - void (*fl_break)(struct file_lock *); - int (*fl_change)(struct file_lock **, int); + int (*lm_compare_owner)(struct file_lock *, struct file_lock *); + void (*lm_notify)(struct file_lock *); /* unblock callback */ + int (*lm_grant)(struct file_lock *, struct file_lock *, int); + void (*lm_release_private)(struct file_lock *); + void (*lm_break)(struct file_lock *); + int (*lm_change)(struct file_lock **, int); }; struct lock_manager { -- cgit v1.2.3 From b0d40c92adafde7c2d81203ce7c1c69275f41140 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:42 +1000 Subject: superblock: introduce per-sb cache shrinker infrastructure With context based shrinkers, we can implement a per-superblock shrinker that shrinks the caches attached to the superblock. We currently have global shrinkers for the inode and dentry caches that split up into per-superblock operations via a coarse proportioning method that does not batch very well. The global shrinkers also have a dependency - dentries pin inodes - so we have to be very careful about how we register the global shrinkers so that the implicit call order is always correct. With a per-sb shrinker callout, we can encode this dependency directly into the per-sb shrinker, hence avoiding the need for strictly ordering shrinker registrations. We also have no need for any proportioning code for the shrinker subsystem already provides this functionality across all shrinkers. Allowing the shrinker to operate on a single superblock at a time means that we do less superblock list traversals and locking and reclaim should batch more effectively. This should result in less CPU overhead for reclaim and potentially faster reclaim of items from each filesystem. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 460d2cc21ec6..d7f35e90b84a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -393,6 +393,7 @@ struct inodes_stat_t { #include #include #include +#include #include #include @@ -1444,8 +1445,14 @@ struct super_block { * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid; + + struct shrinker s_shrink; /* per-sb shrinker handle */ }; +/* superblock cache pruning functions */ +extern void prune_icache_sb(struct super_block *sb, int nr_to_scan); +extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan); + extern struct timespec current_fs_time(struct super_block *sb); /* -- cgit v1.2.3 From 0e1fdafd93980eac62e778798549ce0f6073905c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:44 +1000 Subject: superblock: add filesystem shrinker operations Now we have a per-superblock shrinker implementation, we can add a filesystem specific callout to it to allow filesystem internal caches to be shrunk by the superblock shrinker. Rather than perpetuate the multipurpose shrinker callback API (i.e. nr_to_scan == 0 meaning "tell me how many objects freeable in the cache), two operations will be added. The first will return the number of objects that are freeable, the second is the actual shrinker call. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7f35e90b84a..1393742bba9b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1655,6 +1655,8 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); + int (*nr_cached_objects)(struct super_block *); + void (*free_cached_objects)(struct super_block *, int); }; /* -- cgit v1.2.3 From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:43 -0400 Subject: fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and it's write side is always mirrored by real exclusion. It's intended use it to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall way - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches non-zero we wake up a pending truncate using wake_up_bit on a new bit in i_flags - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1393742bba9b..2fe920774abf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -779,7 +779,7 @@ struct inode { struct timespec i_ctime; blkcnt_t i_blocks; unsigned short i_bytes; - struct rw_semaphore i_alloc_sem; + atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space *i_mapping; @@ -1705,6 +1705,10 @@ struct super_operations { * set during data writeback, and cleared with a wakeup * on the bit address once it is done. * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1718,6 +1722,8 @@ struct super_operations { #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) +#define __I_DIO_WAKEUP 9 +#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1828,7 +1834,6 @@ struct file_system_type { struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; - struct lock_class_key i_alloc_sem_key; }; extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, @@ -2404,6 +2409,8 @@ enum { }; void dio_end_io(struct bio *bio, int error); +void inode_dio_wait(struct inode *inode); +void inode_dio_done(struct inode *inode); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, -- cgit v1.2.3 From aacfc19c626ebd3daa675652457d71019a1f583f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:47 -0400 Subject: fs: simplify the blockdev_direct_IO prototype Simple filesystems always pass inode->i_sb_bdev as the block device argument, and never need a end_io handler. Let's simply things for them and for my grepping activity by dropping these arguments. The only thing not falling into that scheme is ext4, which passes and end_io handler without needing special flags (yet), but given how messy the direct I/O code there is use of __blockdev_direct_IO in one instead of two out of three cases isn't going to make a large difference anyway. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2fe920774abf..824453be9fee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2418,12 +2418,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio_submit_t submit_io, int flags); static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) + struct inode *inode, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, NULL, + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif -- cgit v1.2.3 From 982d816581eeeacfe5b2b7c6d47d13a157616eff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Jul 2011 13:21:35 -0400 Subject: fs: add SEEK_HOLE and SEEK_DATA flags This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags. Turns out using fiemap in things like cp cause more problems than it solves, so lets try and give userspace an interface that doesn't suck. We need to match solaris here, and the definitions are *o* If /whence/ is SEEK_HOLE, the offset of the start of the next hole greater than or equal to the supplied offset is returned. The definition of a hole is provided near the end of the DESCRIPTION. *o* If /whence/ is SEEK_DATA, the file pointer is set to the start of the next non-hole file region greater than or equal to the supplied offset. So in the generic case the entire file is data and there is a virtual hole at the end. That means we will just return i_size for SEEK_HOLE and will return the same offset for SEEK_DATA. This is how Solaris does it so we have to do it the same way. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 824453be9fee..4a61f98823a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -32,7 +32,9 @@ #define SEEK_SET 0 /* seek relative to beginning of file */ #define SEEK_CUR 1 /* seek relative to current file position */ #define SEEK_END 2 /* seek relative to end of file */ -#define SEEK_MAX SEEK_END +#define SEEK_DATA 3 /* seek to the next data */ +#define SEEK_HOLE 4 /* seek to the next hole */ +#define SEEK_MAX SEEK_HOLE struct fstrim_range { __u64 start; -- cgit v1.2.3 From 02c24a82187d5a628c68edfe71ae60dc135cd178 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 16 Jul 2011 20:44:56 -0400 Subject: fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers Btrfs needs to be able to control how filemap_write_and_wait_range() is called in fsync to make it less of a painful operation, so push down taking i_mutex and the calling of filemap_write_and_wait() down into the ->fsync() handlers. Some file systems can drop taking the i_mutex altogether it seems, like ext3 and ocfs2. For correctness sake I just pushed everything down in all cases to make sure that we keep the current behavior the same for everybody, and then each individual fs maintainer can make up their mind about what to do from there. Thanks, Acked-by: Jan Kara Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- include/linux/fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4a61f98823a6..9cd2075c4a39 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1572,7 +1572,7 @@ struct file_operations { int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, int datasync); + int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -2360,7 +2360,8 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern int blkdev_fsync(struct file *filp, int datasync); +extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, @@ -2490,7 +2491,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int noop_fsync(struct file *, int); +extern int noop_fsync(struct file *, loff_t, loff_t, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2515,7 +2516,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int generic_file_fsync(struct file *, int); +extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -- cgit v1.2.3 From 295cc522c2b2834b302424e41ebb5846df6d61bd Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Tue, 19 Jul 2011 09:48:39 +0800 Subject: fs:update the NOTE of the file_operations structure Big kernel lock had been removed and setlease now use the lock_flocks() to hold a special spin lock file_lock_lock by Matthew. So just remove the out-of-date NOTE. Signed-off-by: Wanlong Gao Signed-off-by: Al Viro --- include/linux/fs.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9cd2075c4a39..e0569e4ab2d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1552,11 +1552,6 @@ struct block_device_operations; #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 -/* - * NOTE: - * all file operations except setlease can be called without - * the big kernel lock held in all filesystems. - */ struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); -- cgit v1.2.3 From ed70afcd6e795e3de98df56f1cd0f898fbf641a7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 21 Jul 2011 13:55:37 -0700 Subject: mm/truncate.c: fix build for CONFIG_BLOCK not enabled Fix build error when CONFIG_BLOCK is not enabled by providing a stub inode_dio_wait() function. mm/truncate.c:612: error: implicit declaration of function 'inode_dio_wait' Signed-off-by: Randy Dunlap Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e0569e4ab2d5..b224dc468a23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2423,6 +2423,10 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, offset, nr_segs, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } +#else +static inline void inode_dio_wait(struct inode *inode) +{ +} #endif extern const struct file_operations generic_ro_fops; -- cgit v1.2.3 From 423e0ab086ad8b33626e45fa94ac7613146b7ffa Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 19 Jul 2011 09:32:38 -0700 Subject: VFS : mount lock scalability for internal mounts For a number of file systems that don't have a mount point (e.g. sockfs and pipefs), they are not marked as long term. Therefore in mntput_no_expire, all locks in vfs_mount lock are taken instead of just local cpu's lock to aggregate reference counts when we release reference to file objects. In fact, only local lock need to have been taken to update ref counts as these file systems are in no danger of going away until we are ready to unregister them. The attached patch marks file systems using kern_mount without mount point as long term. The contentions of vfs_mount lock is now eliminated. Before un-registering such file system, kern_unmount should be called to remove the long term flag and make the mount point ready to be freed. Signed-off-by: Tim Chen Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b224dc468a23..7a757a48a5c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1885,6 +1885,7 @@ extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) +extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); -- cgit v1.2.3 From 4e34e719e457f2e031297175410fc0bd4016a085 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Jul 2011 17:37:31 +0200 Subject: fs: take the ACL checks to common code Replace the ->check_acl method with a ->get_acl method that simply reads an ACL from disk after having a cache miss. This means we can replace the ACL checking boilerplate code with a single implementation in namei.c. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a757a48a5c6..12f84b30c3ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1586,7 +1586,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); + struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); -- cgit v1.2.3 From a209dfc7b0d94bd6fa94553c097836a2e6d0f0ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2011 11:36:34 +0200 Subject: vfs: dont chain pipe/anon/socket on superblock s_inodes list Workloads using pipes and sockets hit inode_sb_list_lock contention. superblock s_inodes list is needed for quota, dirty, pagecache and fsnotify management. pipe/anon/socket fs are clearly not candidates for these. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a6658043258a..cc363fa7bb82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2310,7 +2310,8 @@ extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void end_writeback(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode(struct super_block *); +extern struct inode *new_inode_pseudo(struct super_block *sb); +extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); -- cgit v1.2.3 From f2ee7abf4c40c8e6bffced923a7c01ea2d1f6c97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Jul 2011 06:41:09 +0200 Subject: vfs: avoid taking inode_hash_lock on pipes and sockets Some inodes (pipes, sockets, ...) are not hashed, no need to take contended inode_hash_lock at dismantle time. nice speedup on SMP machines on socket intensive workloads. Signed-off-by: Eric Dumazet Signed-off-by: Al Viro --- include/linux/fs.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index f23bcb77260c..786b3b1113cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2317,11 +2317,18 @@ extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); -extern void remove_inode_hash(struct inode *); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } + +extern void __remove_inode_hash(struct inode *); +static inline void remove_inode_hash(struct inode *inode) +{ + if (!inode_unhashed(inode)) + __remove_inode_hash(inode); +} + extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -- cgit v1.2.3 From 3ddcd0569cd68f00f3beae9a7959b72918bb91f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 6 Aug 2011 22:45:50 -0700 Subject: vfs: optimize inode cache access patterns The inode structure layout is largely random, and some of the vfs paths really do care. The path lookup in particular is already quite D$ intensive, and profiles show that accessing the 'inode->i_op->xyz' fields is quite costly. We already optimized the dcache to not unnecessarily load the d_op structure for members that are often NULL using the DCACHE_OP_xyz bits in dentry->d_flags, and this does something very similar for the inode ops that are used during pathname lookup. It also re-orders the fields so that the fields accessed by 'stat' are together at the beginning of the inode structure, and roughly in the order accessed. The effect of this seems to be in the 1-2% range for an empty kernel "make -j" run (which is fairly kernel-intensive, mostly in filename lookup), so it's visible. The numbers are fairly noisy, though, and likely depend a lot on exact microarchitecture. So there's more tuning to be done. Signed-off-by: Linus Torvalds --- include/linux/fs.h | 59 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 22 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 786b3b1113cf..178cdb4f1d4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -738,22 +738,54 @@ static inline int mapping_writably_mapped(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 + +/* + * Keep mostly read-only and often accessed (especially for + * the RCU path lookup and 'stat' data) fields at the beginning + * of the 'struct inode' + */ struct inode { - /* RCU path lookup touches following: */ umode_t i_mode; + unsigned short i_opflags; uid_t i_uid; gid_t i_gid; + unsigned int i_flags; + +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + const struct inode_operations *i_op; struct super_block *i_sb; + struct address_space *i_mapping; - spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ - unsigned int i_flags; - unsigned long i_state; #ifdef CONFIG_SECURITY void *i_security; #endif - struct mutex i_mutex; + /* Stat data, not accessed from path walking */ + unsigned long i_ino; + unsigned int i_nlink; + dev_t i_rdev; + loff_t i_size; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; + unsigned int i_blkbits; + blkcnt_t i_blocks; + +#ifdef __NEED_I_SIZE_ORDERED + seqcount_t i_size_seqcount; +#endif + + /* Misc */ + unsigned long i_state; + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + struct mutex i_mutex; unsigned long dirtied_when; /* jiffies of first dirtying */ @@ -765,25 +797,12 @@ struct inode { struct list_head i_dentry; struct rcu_head i_rcu; }; - unsigned long i_ino; atomic_t i_count; - unsigned int i_nlink; - dev_t i_rdev; - unsigned int i_blkbits; u64 i_version; - loff_t i_size; -#ifdef __NEED_I_SIZE_ORDERED - seqcount_t i_size_seqcount; -#endif - struct timespec i_atime; - struct timespec i_mtime; - struct timespec i_ctime; - blkcnt_t i_blocks; unsigned short i_bytes; atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; - struct address_space *i_mapping; struct address_space i_data; #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; @@ -806,10 +825,6 @@ struct inode { atomic_t i_readcount; /* struct files open RO */ #endif atomic_t i_writecount; -#ifdef CONFIG_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif void *i_private; /* fs or device private pointer */ }; -- cgit v1.2.3