From b4caecd48005fbed3949dde6c1cb233142fd69e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:32 +0100 Subject: fs: introduce f_op->mmap_capabilities for nommu mmap support Since "BDI: Provide backing device capability information [try #3]" the backing_dev_info structure also provides flags for the kind of mmap operation available in a nommu environment, which is entirely unrelated to it's original purpose. Introduce a new nommu-only file operation to provide this information to the nommu mmap code instead. Splitting this from the backing_dev_info structure allows to remove lots of backing_dev_info instance that aren't otherwise needed, and entirely gets rid of the concept of providing a backing_dev_info for a character device. It also removes the need for the mtd_inodefs filesystem. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Acked-by: Brian Norris Signed-off-by: Jens Axboe --- mm/backing-dev.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..16c68958aeda 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -17,8 +17,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -513,13 +511,12 @@ EXPORT_SYMBOL(bdi_destroy); * For use from filesystems to quickly init and register a bdi associated * with dirty writeback */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, - unsigned int cap) +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { int err; bdi->name = name; - bdi->capabilities = cap; + bdi->capabilities = 0; err = bdi_init(bdi); if (err) return err; -- cgit v1.2.3 From b83ae6d421435c6204150300f1c25bfbd39cd62b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:37 +0100 Subject: fs: remove mapping->backing_dev_info Now that we never use the backing_dev_info pointer in struct address_space we can simply remove it and save 4 to 8 bytes in every inode. Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- mm/backing-dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 16c68958aeda..52e0c7652448 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,7 +24,6 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; -- cgit v1.2.3 From c4db59d31e39ea067c32163ac961e9c80198fd37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jan 2015 14:05:00 -0700 Subject: fs: don't reassign dirty inodes to default_backing_dev_info If we have dirty inodes we need to call the filesystem for it, even if the device has been removed and the filesystem will error out early. The current code does that by reassining all dirty inodes to the default backing_dev_info when a bdi is unlinked, but that's pretty pointless given that the bdi must always outlive the super block. Instead of stopping writeback at unregister time and moving inodes to the default bdi just keep the current bdi alive until it is destroyed. The containing objects of the bdi ensure this doesn't happen until all writeback has finished by erroring out. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Killed the redundant WARN_ON(), as noticed by Jan. Signed-off-by: Jens Axboe --- mm/backing-dev.c | 90 +++++++++++++++----------------------------------------- 1 file changed, 23 insertions(+), 67 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 52e0c7652448..1725adb242e0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,17 +37,6 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) -{ - if (wb1 < wb2) { - spin_lock(&wb1->list_lock); - spin_lock_nested(&wb2->list_lock, 1); - } else { - spin_lock(&wb2->list_lock); - spin_lock_nested(&wb1->list_lock, 1); - } -} - #ifdef CONFIG_DEBUG_FS #include #include @@ -352,19 +341,19 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - if (!bdi_cap_writeback_dirty(bdi)) + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + spin_unlock_bh(&bdi->wb_lock); return; + } + spin_unlock_bh(&bdi->wb_lock); /* * Make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); - /* Make sure nobody queues further work */ - spin_lock_bh(&bdi->wb_lock); - clear_bit(BDI_registered, &bdi->state); - spin_unlock_bh(&bdi->wb_lock); - /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi @@ -372,37 +361,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); flush_delayed_work(&bdi->wb.dwork); - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); } /* - * This bdi is going away now, make sure that no super_blocks point to it + * Called when the device behind @bdi has been removed or ejected. + * + * We can't really do much here except for reducing the dirty ratio at + * the moment. In the future we should be able to set a flag so that + * the filesystem can handle errors at mark_inode_dirty time instead + * of only at writeback time. */ -static void bdi_prune_sb(struct backing_dev_info *bdi) -{ - struct super_block *sb; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdi == bdi) - sb->s_bdi = &default_backing_dev_info; - } - spin_unlock(&sb_lock); -} - void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { - bdi_set_min_ratio(bdi, 0); - trace_writeback_bdi_unregister(bdi); - bdi_prune_sb(bdi); + if (WARN_ON_ONCE(!bdi->dev)) + return; - bdi_wb_shutdown(bdi); - bdi_debug_unregister(bdi); - device_unregister(bdi->dev); - bdi->dev = NULL; - } + bdi_set_min_ratio(bdi, 0); } EXPORT_SYMBOL(bdi_unregister); @@ -471,37 +445,19 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; - /* - * Splice our entries to the default_backing_dev_info. This - * condition shouldn't happen. @wb must be empty at this point and - * dirty inodes on it might cause other issues. This workaround is - * added by ce5f8e779519 ("writeback: splice dirty inode entries to - * default bdi on bdi_destroy()") without root-causing the issue. - * - * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com - * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 - * - * We should probably add WARN_ON() to find out whether it still - * happens and track it down if so. - */ - if (bdi_has_dirty_io(bdi)) { - struct bdi_writeback *dst = &default_backing_dev_info.wb; - - bdi_lock_two(&bdi->wb, dst); - list_splice(&bdi->wb.b_dirty, &dst->b_dirty); - list_splice(&bdi->wb.b_io, &dst->b_io); - list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&bdi->wb.list_lock); - spin_unlock(&dst->list_lock); - } - - bdi_unregister(bdi); + bdi_wb_shutdown(bdi); + WARN_ON(!list_empty(&bdi->work_list)); WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); -- cgit v1.2.3 From df0ce26cb4ee8bc233d50213b97213532aff0a3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:41 +0100 Subject: fs: remove default_backing_dev_info Now that default_backing_dev_info is not used for writeback purposes we can git rid of it easily: - instead of using it's name for tracing unregistered bdi we just use "unknown" - btrfs and ceph can just assign the default read ahead window themselves like several other filesystems already do. - we can assign noop_backing_dev_info as the default one in alloc_super. All filesystems already either assigned their own or noop_backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- mm/backing-dev.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1725adb242e0..7690ec77c722 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,12 +14,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -struct backing_dev_info default_backing_dev_info = { - .name = "default", - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, @@ -250,9 +244,6 @@ static int __init default_bdi_init(void) if (!bdi_wq) return -ENOMEM; - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); err = bdi_init(&noop_backing_dev_info); return err; -- cgit v1.2.3 From 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:00 -0500 Subject: vfs: add support for a lazytime mount option Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated when (a) if the inode needs to be updated for some non-time related change, (b) if userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests via a fsync(2) call. For workloads which feature a large number of random write to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDD's, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- mm/backing-dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..915feea94c66 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty, nr_io, nr_more_io, + nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } -- cgit v1.2.3