From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- fs/block_dev.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f908f0..3c9a03e51b62 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); } static sector_t max_block(struct block_device *bdev) -- cgit v1.2.3 From d4c208b86b8be4254eba0e74071496e599f94639 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Jun 2011 12:45:48 +0200 Subject: block: use the passed in @bdev when claiming if partno is zero 6b4517a791 (block: implement bd_claiming and claiming block) introduced claiming block to support O_EXCL blkdev opens properly. bd_start_claiming() looks up the part 0 bdev and starts claiming block. The function assumed that there is only one part 0 bdev and always used bdget_disk(disk, 0) to look it up; unfortunately, this isn't true for some drivers (floppy) which use multiple block devices to denote different operating parameters for the same physical device. There can be multiple part 0 bdev's for the same device number. This incorrect assumption caused the wrong bdev to be used during claiming leading to unbalanced bd_holders as reported in the following bug. https://bugzilla.kernel.org/show_bug.cgi?id=28522 This patch updates bd_start_claiming() such that it uses the bdev specified as argument if its partno is zero. Note that this means that different bdev's can be used for the same device and O_EXCL check can be effectively bypassed. It has always been broken that way and floppy is fortunately on its way out. Leave that breakage alone. Signed-off-by: Tejun Heo Reported-by: Alex Villacis Lasso Tested-by: Alex Villacis Lasso Cc: stable@kernel.org # >= v2.6.36 Signed-off-by: Jens Axboe --- fs/block_dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f908f0..610e8e0b04b8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -762,7 +762,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!disk) return ERR_PTR(-ENXIO); - whole = bdget_disk(disk, 0); + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + module_put(disk->fops->owner); put_disk(disk); if (!whole) -- cgit v1.2.3 From 85ef06d1d252f6a2e73b678591ab71caad4667bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Jul 2011 16:17:47 +0200 Subject: block: flush MEDIA_CHANGE from drivers on close(2) Currently, only open(2) is defined as the 'clearing' point. It has two roles - first, it's an acknowledgement from userland indicating that the event has been received and kernel can clear pending states and proceed to generate more events. Secondly, it's passed on to device drivers as a hint indicating that a synchronization point has been reached and it might want to take a deeper look at the device. The latter currently is only used by sr which uses two different mechanisms - GET_EVENT_MEDIA_STATUS_NOTIFICATION and TEST_UNIT_READY to discover events, where the former is lighter weight and safe to be used repeatedly but may not provide full coverage. Among other things, GET_EVENT can't detect media removal while TUR can. This patch makes close(2) - blkdev_put() - indicate clearing hint for MEDIA_CHANGE to drivers. disk_check_events() is renamed to disk_flush_events() and updated to take @mask for events to flush which is or'd to ev->clearing and will be passed to the driver on the next ->check_events() invocation. This change makes sr generate MEDIA_CHANGE when media is ejected from userland - e.g. with eject(1). Note: Given the current usage, it seems @clearing hint is needlessly complex. disk_clear_events() can simply clear all events and the hint can be boolean @flush. Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- fs/block_dev.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 610e8e0b04b8..6ff7f070016f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1447,6 +1447,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + mutex_lock(&bdev->bd_mutex); + if (mode & FMODE_EXCL) { bool bdev_free; @@ -1455,7 +1457,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ - mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); @@ -1473,17 +1474,21 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ - if (bdev_free) { - if (bdev->bd_write_holder) { - disk_unblock_events(bdev->bd_disk); - disk_check_events(bdev->bd_disk); - bdev->bd_write_holder = false; - } + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; } - - mutex_unlock(&bdev->bd_mutex); } + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + + mutex_unlock(&bdev->bd_mutex); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); -- cgit v1.2.3 From 06222e491e663dac939f04b125c9dc52126a75c4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Jul 2011 13:21:38 -0400 Subject: fs: handle SEEK_HOLE/SEEK_DATA properly in all fs's that define their own llseek This converts everybody to handle SEEK_HOLE/SEEK_DATA properly. In some cases we just return -EINVAL, in others we do the normal generic thing, and in others we're simply making sure that the properly due-dilligence is done. For example in NFS/CIFS we need to make sure the file size is update properly for the SEEK_HOLE and SEEK_DATA case, but since it calls the generic llseek stuff itself that is all we have to do. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- fs/block_dev.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 610e8e0b04b8..966617a422d9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -355,20 +355,25 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&bd_inode->i_mutex); size = i_size_read(bd_inode); + retval = -EINVAL; switch (origin) { - case 2: + case SEEK_END: offset += size; break; - case 1: + case SEEK_CUR: offset += file->f_pos; + case SEEK_SET: + break; + default: + goto out; } - retval = -EINVAL; if (offset >= 0 && offset <= size) { if (offset != file->f_pos) { file->f_pos = offset; } retval = offset; } +out: mutex_unlock(&bd_inode->i_mutex); return retval; } -- cgit v1.2.3 From 02c24a82187d5a628c68edfe71ae60dc135cd178 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 16 Jul 2011 20:44:56 -0400 Subject: fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers Btrfs needs to be able to control how filemap_write_and_wait_range() is called in fsync to make it less of a painful operation, so push down taking i_mutex and the calling of filemap_write_and_wait() down into the ->fsync() handlers. Some file systems can drop taking the i_mutex altogether it seems, like ext3 and ocfs2. For correctness sake I just pushed everything down in all cases to make sure that we keep the current behavior the same for everybody, and then each individual fs maintainer can make up their mind about what to do from there. Thanks, Acked-by: Jan Kara Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- fs/block_dev.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 966617a422d9..9fb0b15331d3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -378,7 +378,7 @@ out: return retval; } -int blkdev_fsync(struct file *filp, int datasync) +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); @@ -389,14 +389,10 @@ int blkdev_fsync(struct file *filp, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); if (error == -EOPNOTSUPP) error = 0; - mutex_lock(&bd_inode->i_mutex); - return error; } EXPORT_SYMBOL(blkdev_fsync); -- cgit v1.2.3 From 782b94cdf577b4df1feb376f372dccc28e66a771 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 30 Jun 2011 11:01:45 +1000 Subject: block: initialise bd_super in bdget() bd_super is currently reset to NULL in kill_block_super() so we rely on previous users of the block_device object to initialise this value for the next user. This quirk was exposed on RHEL5 when a third party filesystem did not always use kill_block_super() and therefore bd_super wasn't being reset when a block_device object was recycled within the cache. This may not be a problem upstream but makes sense to be defensive. Signed-off-by: Lachlan McIlroy Reviewed-by: Eric Sandeen Signed-off-by: Al Viro --- fs/block_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index f55aad4d1611..f28680553288 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -552,6 +552,7 @@ struct block_device *bdget(dev_t dev) if (inode->i_state & I_NEW) { bdev->bd_contains = NULL; + bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; -- cgit v1.2.3 From da5aa861bea09197e6ae4d7c46618616064891e4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Aug 2011 02:17:48 +0200 Subject: fix block device fallout from ->fsync() changes blkdev_fsync() needs to write pages in pagecache... Signed-off-by: Rafael J. Wysocki Signed-off-by: Al Viro --- fs/block_dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index f28680553288..ff77262e887c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -387,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); int error; + + error = filemap_write_and_wait_range(filp->f_mapping, start, end); + if (error) + return error; /* * There is no need to serialise calls to blkdev_issue_flush with -- cgit v1.2.3