From facd07b07d2a7988f5ce849558838cc953847637 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sun, 23 May 2010 11:00:55 -0400 Subject: direct-io: add a hook for the fs to provide its own submit_bio function Because BTRFS can do RAID and such, we need our own submit hook so we can set up the bios in the correct fashion, and handle checksum errors properly. So there are a few changes here 1) The submit_io hook. This is straightforward, just call this instead of submit_bio. 2) Allow the fs to return -ENOTBLK for reads. Usually this has only worked for writes, since writes can fall back onto buffered IO. But BTRFS needs the option of falling back on buffered IO if it encounters a compressed extent, since we need to read the entire extent in and decompress it. So if we get -ENOTBLK back from get_block we'll return and fall back on buffered IO just like the write case. I've tested these changes with fsx and everything seems to work. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- include/linux/fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 44f35aea2f1f..10704f0086c8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2250,10 +2250,15 @@ static inline int xip_truncate_page(struct address_space *mapping, loff_t from) #endif #ifdef CONFIG_BLOCK +struct bio; +typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset); +void dio_end_io(struct bio *bio, int error); + ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int lock_type); + dio_submit_t submit_io, int lock_type); enum { /* need locking between buffered and direct access */ @@ -2269,7 +2274,7 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, dio_iodone_t
end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, + nr_segs, get_block, end_io, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } @@ -2279,7 +2284,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, 0); + nr_segs, get_block, end_io, NULL, 0); } #endif -- cgit v1.2.3 From ae6afc3f5cf53fb97bac2d0a209bb465952742e7 Mon Sep 17 00:00:00 2001 From: jan Blunck Date: Wed, 26 May 2010 14:44:48 -0700 Subject: vfs: introduce noop_llseek() This is an implementation of ->llseek useable for the rare special case when userspace expects the seek to succeed but the (device) file is actually not able to perform the seek. In this case you use noop_llseek() instead of falling back to the default implementation of ->llseek. Signed-off-by: Jan Blunck Cc: Frederic Weisbecker Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b336cb9ca9a0..9682d52d1507 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2228,6 +2228,7 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, -- cgit v1.2.3 From d7065da038227a4d09a244e6014e0186a6bd21d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 May 2010 15:13:55 -0400 Subject: get rid of the magic around f_count in aio __aio_put_req() plays sick games 
with file refcount. What it wants is fput() from atomic context; it's almost always done with f_count > 1, so they only have to deal with delayed work in rare cases when their reference happens to be the last one. Current code decrements f_count and if it hasn't hit 0, everything is fine. Otherwise it keeps a pointer to struct file (with zero f_count!) around and has delayed work do __fput() on it. Better way to do it: use atomic_long_add_unless( , -1, 1) instead of !atomic_long_dec_and_test(). IOW, decrement it only if it's not the last reference, leave refcount alone if it was. And use normal fput() in delayed work. I've made that atomic_long_add_unless call a new helper - fput_atomic(). Drops a reference to file if it's safe to do in atomic (i.e. if that's not the last one), tells if it had been able to do that. aio.c converted to it, __fput() use is gone. req->ki_file *always* contributes to refcount now. And __fput() became static. Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 85e823adcd4a..3d9ed8302402 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -954,6 +954,7 @@ extern spinlock_t files_lock; #define file_list_unlock() spin_unlock(&files_lock); #define get_file(x) atomic_long_inc(&(x)->f_count) +#define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) #define file_count(x) atomic_long_read(&(x)->f_count) #ifdef CONFIG_DEBUG_WRITECOUNT -- cgit v1.2.3 From 7ea8085910ef3dd4f3cad6845aaa2b580d39b115 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 May 2010 17:53:25 +0200 Subject: drop unused dentry argument to ->fsync Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d9ed8302402..eb39e5eb77f5 100644 --- 
a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1498,7 +1498,7 @@ struct file_operations { int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, struct dentry *, int datasync); + int (*fsync) (struct file *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -2213,7 +2213,7 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync); +extern int blkdev_fsync(struct file *filp, int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, @@ -2348,7 +2348,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int simple_sync_file(struct file *, struct dentry *, int); +extern int simple_sync_file(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2373,7 +2373,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int simple_fsync(struct file *, struct dentry *, int); +extern int simple_fsync(struct file *, int); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, -- cgit v1.2.3 From 1b061d9247f71cd15edc4c4c4600191a903642c0 Mon 
Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 May 2010 17:53:41 +0200 Subject: rename the generic fsync implementations We don't name our generic fsync implementations very well currently. The no-op implementation for in-memory filesystems currently is called simple_sync_file which doesn't make too much sense to start with, and the generic one for simple filesystems is called simple_fsync which can lead to some confusion. This patch renames the generic file fsync method to generic_file_fsync to match the other generic_file_* routines it is supposed to be used with, and the no-op implementation to noop_fsync to make it obvious what to expect. In addition add some documentation for both methods. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index eb39e5eb77f5..acf6c52a50dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2348,7 +2348,7 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int simple_sync_file(struct file *, int); +extern int noop_fsync(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2373,7 +2373,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int simple_fsync(struct file *, int); +extern int generic_file_fsync(struct file *, int); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct
address_space *, -- cgit v1.2.3 From 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Thu, 27 May 2010 01:05:33 +1000 Subject: fs: introduce new truncate sequence Introduce a new truncate calling sequence into fs/mm subsystems. Rather than setattr > vmtruncate > truncate, have filesystems call their truncate sequence from ->setattr if filesystem specific operations are required. vmtruncate is deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced previously should be used. simple_setattr is introduced for simple in-ram filesystems to implement the new truncate sequence. Eventually all filesystems should be converted to implement a setattr, and the default code in notify_change should go away. simple_setsize is also introduced to perform just the ATTR_SIZE portion of simple_setattr (ie. changing i_size and trimming pagecache). To implement the new truncate sequence: - filesystem specific manipulations (eg freeing blocks) must be done in the setattr method rather than ->truncate. - vmtruncate can not be used by core code to trim blocks past i_size in the event of write failure after allocation, so this must be performed in the fs code. - convert usage of helpers block_write_begin, nobh_write_begin, cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed variants. These avoid calling vmtruncate to trim blocks (see previous). - inode_setattr should not be used. generic_setattr is a new function to be used to copy simple attributes into the generic inode. - make use of the better opportunity to handle errors with the new sequence. Big problem with the previous calling sequence: the filesystem is not called until i_size has already changed. This means it is not allowed to fail the call, and also it does not know what the previous i_size was. 
Also, generic code calling vmtruncate to truncate allocated blocks in case of error had no good way to return a meaningful error (or, for example, atomically handle block deallocation). Cc: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- include/linux/fs.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index acf6c52a50dd..3428393942a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,10 @@ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); void dio_end_io(struct bio *bio, int error); +ssize_t __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int lock_type); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, @@ -2270,6 +2274,24 @@ enum { DIO_SKIP_HOLES = 0x02, }; +static inline ssize_t blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, NULL, + DIO_LOCKING | DIO_SKIP_HOLES); +} + +static inline ssize_t blockdev_direct_IO_no_locking_newtrunc(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, 
end_io, NULL, 0); +} static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, @@ -2342,12 +2364,14 @@ extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_setattr(struct dentry *, struct iattr *); extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int simple_setsize(struct inode *, loff_t); extern int noop_fsync(struct file *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); @@ -2384,7 +2408,8 @@ extern int buffer_migrate_page(struct address_space *, extern int inode_change_ok(const struct inode *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); -extern int __must_check inode_setattr(struct inode *, struct iattr *); +extern int __must_check inode_setattr(struct inode *, const struct iattr *); +extern void generic_setattr(struct inode *inode, const struct iattr *attr); extern void file_update_time(struct file *file); -- cgit v1.2.3