From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:26 -0400 Subject: writeback: move backing_dev_info->state into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->state into wb. * enum bdi_state is renamed to wb_state and the prefix of all enums is changed from BDI_ to WB_. * Explicit zeroing of bdi->state is removed without adding zeroing of wb->state as the whole data structure is zeroed on init anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: drbd-dev@lists.linbit.com Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aff923ae8c4b..eb14f988a63e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,13 +25,13 @@ struct device; struct dentry; /* - * Bits in backing_dev_info.state + * Bits in bdi_writeback.state */ -enum bdi_state { - BDI_async_congested, /* The async (write) queue is getting full */ - BDI_sync_congested, /* The sync queue is getting full */ - BDI_registered, /* bdi_register() was done */ - BDI_writeback_running, /* Writeback is in progress */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + 
WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ }; typedef int (congested_fn)(void *, int); @@ -49,6 +49,7 @@ enum bdi_stat_item { struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ + unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct delayed_work dwork; /* work item used for writeback */ @@ -62,7 +63,6 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned long state; /* Always use atomic bitops on this */ unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ @@ -250,23 +250,23 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->state & bdi_bits); + return (bdi->wb.state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_sync_congested); + return bdi_congested(bdi, 1 << WB_sync_congested); } static inline int bdi_write_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_async_congested); + return bdi_congested(bdi, 1 << WB_async_congested); } static inline int bdi_rw_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, (1 << BDI_sync_congested) | - (1 << BDI_async_congested)); + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); } enum { -- cgit v1.2.3 From 93f78d882865cb90020d0f80a9523c99cf46924c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:27 -0400 Subject: writeback: move backing_dev_info->bdi_stat[] into bdi_writeback Currently, a bdi 
(backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->bdi_stat[] into wb. * enum bdi_stat_item is renamed to wb_stat_item and the prefix of all enums is changed from BDI_ to WB_. * BDI_STAT_BATCH() -> WB_STAT_BATCH() * [__]{add|inc|dec|sum}_wb_stat(bdi, ...) -> [__]{add|inc}_wb_stat(wb, ...) * bdi_stat[_error]() -> wb_stat[_error]() * bdi_writeout_inc() -> wb_writeout_inc() * stat init is moved to bdi_wb_init() and bdi_wb_exit() is added and frees stat. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 68 +++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index eb14f988a63e..fe7a907a4e16 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,15 +36,15 @@ enum wb_state { typedef int (congested_fn)(void *, int); -enum bdi_stat_item { - BDI_RECLAIMABLE, - BDI_WRITEBACK, - BDI_DIRTIED, - BDI_WRITTEN, - NR_BDI_STAT_ITEMS +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS }; -#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -58,6 +58,8 @@ struct bdi_writeback { struct list_head b_more_io; /* parked for 
more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; }; struct backing_dev_info { @@ -69,8 +71,6 @@ struct backing_dev_info { char *name; - struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; - unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -137,78 +137,74 @@ static inline int wb_has_dirty_io(struct bdi_writeback *wb) !list_empty(&wb->b_more_io); } -static inline void __add_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item, s64 amount) +static inline void __add_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) { - __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); + __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __inc_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, 1); + __add_wb_stat(wb, item, 1); } -static inline void inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __inc_bdi_stat(bdi, item); + __inc_wb_stat(wb, item); local_irq_restore(flags); } -static inline void __dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __dec_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, -1); + __add_wb_stat(wb, item, -1); } -static inline void dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __dec_bdi_stat(bdi, item); + __dec_wb_stat(wb, item); 
local_irq_restore(flags); } -static inline s64 bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - return percpu_counter_read_positive(&bdi->bdi_stat[item]); + return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 __wb_stat_sum(struct bdi_writeback *wb, + enum wb_stat_item item) { - return percpu_counter_sum_positive(&bdi->bdi_stat[item]); + return percpu_counter_sum_positive(&wb->stat[item]); } -static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { s64 sum; unsigned long flags; local_irq_save(flags); - sum = __bdi_stat_sum(bdi, item); + sum = __wb_stat_sum(wb, item); local_irq_restore(flags); return sum; } -extern void bdi_writeout_inc(struct backing_dev_info *bdi); +extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +static inline unsigned long wb_stat_error(struct bdi_writeback *wb) { #ifdef CONFIG_SMP - return nr_cpu_ids * BDI_STAT_BATCH; + return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif -- cgit v1.2.3 From a88a341a73be4ef035ca26170c849f002797da27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:28 -0400 Subject: writeback: move bandwidth related fields from backing_dev_info into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. 
This patch moves bandwidth related fields from backing_dev_info into bdi_writeback. * The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp, write_bandwidth, avg_write_bandwidth, dirty_ratelimit, balanced_dirty_ratelimit, completions and dirty_exceeded. * writeback_chunk_size() and over_bground_thresh() now take @wb instead of @bdi. * bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...) bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...) bdi_position_ratio(bdi, ...) -> wb_position_ratio(wb, ...) bdi_update_write_bandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...) [__]bdi_update_bandwidth(bdi, ...) -> [__]wb_update_bandwidth(wb, ...) bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...) bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...) * Init/exits of the relocated fields are moved to bdi_wb_init/exit() respectively. Note that explicit zeroing is dropped in the process as wb's are cleared in entirety anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. v2: Typo in description fixed as suggested by Jan. 
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Jaegeuk Kim Cc: Steven Whitehouse Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fe7a907a4e16..2ab06049d812 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -60,16 +60,6 @@ struct bdi_writeback { spinlock_t list_lock; /* protects the b_* lists */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; @@ -88,6 +78,16 @@ struct backing_dev_info { struct fprop_local_percpu completions; int dirty_exceeded; +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; -- cgit v1.2.3 From f0054bb1e1f3be03cc33369df640db97f10f6172 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:30 -0400 Subject: writeback: move backing_dev_info->wb_lock and ->worklist into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. 
For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->wb_lock and ->worklist into wb. * The lock protects bdi->worklist and bdi->wb.dwork scheduling. While moving, rename it to wb->work_lock as wb->wb_lock is confusing. Also, move wb->dwork downwards so that it's colocated with the new ->work_lock and ->work_list fields. * bdi_writeback_workfn() -> wb_workfn() bdi_wakeup_thread_delayed(bdi) -> wb_wakeup_delayed(wb) bdi_wakeup_thread(bdi) -> wb_wakeup(wb) bdi_queue_work(bdi, ...) -> wb_queue_work(wb, ...) __bdi_start_writeback(bdi, ...) -> __wb_start_writeback(wb, ...) get_next_work_item(bdi) -> get_next_work_item(wb) * bdi_wb_shutdown() is renamed to wb_shutdown() and now takes @wb. The function contained parts which belong to the containing bdi rather than the wb itself - testing cap_writeback_dirty and bdi_remove_from_list() invocation. Those are moved to bdi_unregister(). * bdi_wb_{init|exit}() are renamed to wb_{init|exit}(). Initializations of the moved bdi->wb_lock and ->work_list are relocated from bdi_init() to wb_init(). * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. 
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2ab06049d812..d796f49ce87a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -52,7 +52,6 @@ struct bdi_writeback { unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ - struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -78,6 +77,10 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ }; struct backing_dev_info { @@ -93,9 +96,6 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ - - struct list_head work_list; struct device *dev; @@ -121,9 +121,9 @@ int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -void bdi_writeback_workfn(struct work_struct *work); +void wb_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -- cgit v1.2.3 From 
66114cad64bf76a155fec1f0fff0de771cf909d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:32 -0400 Subject: writeback: separate out include/linux/backing-dev-defs.h With the planned cgroup writeback support, backing-dev related declarations will be more widely used across block and cgroup; unfortunately, including backing-dev.h from include/linux/blkdev.h makes cyclic include dependency quite likely. This patch separates out backing-dev-defs.h which only has the essential definitions and updates blkdev.h to include it. c files which need access to more backing-dev details now include backing-dev.h directly. This takes backing-dev.h off the common include dependency chain making it a lot easier to use it across block and cgroup. v2: fs/fat build failure fixed. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 102 +------------------------------------------- 1 file changed, 1 insertion(+), 101 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d796f49ce87a..5e39f7a8efed 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -8,104 +8,11 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H -#include -#include -#include #include #include #include -#include #include -#include -#include -#include - -struct page; -struct device; -struct dentry; - -/* - * Bits in bdi_writeback.state - */ -enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ - WB_registered, /* bdi_register() was done */ - WB_writeback_running, /* Writeback is in progress */ -}; - -typedef int (congested_fn)(void *, int); - -enum wb_stat_item { - WB_RECLAIMABLE, - WB_WRITEBACK, - WB_DIRTIED, - WB_WRITTEN, - NR_WB_STAT_ITEMS -}; - -#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) - -struct bdi_writeback { - struct 
backing_dev_info *bdi; /* our parent bdi */ - - unsigned long state; /* Always use atomic bitops on this */ - unsigned long last_old_flush; /* last old data flush */ - - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - struct list_head b_dirty_time; /* time stamps are dirty */ - spinlock_t list_lock; /* protects the b_* lists */ - - struct percpu_counter stat[NR_WB_STAT_ITEMS]; - - unsigned long bw_time_stamp; /* last time write bw is updated */ - unsigned long dirtied_stamp; - unsigned long written_stamp; /* pages written at bw_time_stamp */ - unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ - - /* - * The base dirty throttle rate, re-calculated on every 200ms. - * All the bdi tasks' dirty rate will be curbed under it. - * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit - * in small steps and is much more smooth/stable than the latter. 
- */ - unsigned long dirty_ratelimit; - unsigned long balanced_dirty_ratelimit; - - struct fprop_local_percpu completions; - int dirty_exceeded; - - spinlock_t work_lock; /* protects work_list & dwork scheduling */ - struct list_head work_list; - struct delayed_work dwork; /* work item used for writeback */ -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; - - unsigned int min_ratio; - unsigned int max_ratio, max_prop_frac; - - struct bdi_writeback wb; /* default writeback info for this bdi */ - - struct device *dev; - - struct timer_list laptop_mode_wb_timer; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debug_dir; - struct dentry *debug_stats; -#endif -}; +#include struct backing_dev_info *inode_to_bdi(struct inode *inode); @@ -265,13 +172,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << WB_async_congested)); } -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, -- cgit v1.2.3 From a212b105b07d75b48b1a166378282e8a77fbf53d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:33 -0400 Subject: bdi: make inode_to_bdi() inline Now that bdi definitions are moved to backing-dev-defs.h, backing-dev.h can include blkdev.h and inline inode_to_bdi() without worrying about introducing circular include dependency. The function gets called from hot paths and fairly trivial. 
This patch makes inode_to_bdi() and sb_is_blkdev_sb() that the function calls inline. blockdev_superblock and noop_backing_dev_info are EXPORT_GPL'd to allow the inline functions to be used from modules. While at it, make sb_is_blkdev_sb() return bool instead of int. v2: Fixed typo in description as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5e39f7a8efed..785782034e86 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -11,11 +11,10 @@ #include #include #include +#include #include #include -struct backing_dev_info *inode_to_bdi(struct inode *inode); - int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -149,6 +148,21 @@ extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif + return sb->s_bdi; +} + static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) -- cgit v1.2.3 From 4aa9c692e052cf6db99db62a8fe0543e5c455da7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:35 -0400 Subject: bdi: separate out congested state into a separate struct Currently, a wb's (bdi_writeback) congestion state is carried in its ->state field; however, cgroup writeback support will require multiple wb's sharing the same congestion state. 
This patch separates out congestion state into its own struct - struct bdi_writeback_congested. A new wb field, wb_congested, points to its associated congested struct. The default wb, bdi->wb, always points to bdi->wb_congested. While this patch adds a layer of indirection, it doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 785782034e86..bfdaa18ba0a1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,7 +167,7 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.state & bdi_bits); + return (bdi->wb.congested->state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) -- cgit v1.2.3 From 89e9b9e07a390c50980d10aa37a04631db5a23ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:36 -0400 Subject: writeback: add {CONFIG|BDI_CAP|FS}_CGROUP_WRITEBACK cgroup writeback requires support from both bdi and filesystem sides. Add BDI_CAP_CGROUP_WRITEBACK and FS_CGROUP_WRITEBACK to indicate support and enable BDI_CAP_CGROUP_WRITEBACK on block based bdi's by default. Also, define CONFIG_CGROUP_WRITEBACK which is enabled if both MEMCG and BLK_CGROUP are enabled. inode_cgwb_enabled() which determines whether a given inode's both bdi and fs support cgroup writeback is added. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bfdaa18ba0a1..6bb31234e6a9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -134,12 +134,15 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. + * + * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 #define BDI_CAP_NO_ACCT_WB 0x00000004 #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 +#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -229,4 +232,31 @@ static inline int bdi_sched_wait(void *word) return 0; } -#endif /* _LINUX_BACKING_DEV_H */ +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode + * @inode: inode of interest + * + * cgroup writeback requires support from both the bdi and filesystem. + * Test whether @inode has both. 
+ */ +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return bdi_cap_account_dirty(bdi) && + (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && + (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + return false; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +#endif /* _LINUX_BACKING_DEV_H */ -- cgit v1.2.3 From 52ebea749aaed195245701a8f90a23d672c7a933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:37 -0400 Subject: writeback: make backing_dev_info host cgroup-specific bdi_writebacks For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. 
v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. Signed-off-by: Tejun Heo Cc: kbuild test robot Cc: Dan Carpenter Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 195 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6bb31234e6a9..8ae59df2e3d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include int __must_check bdi_init(struct backing_dev_info *bdi); @@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word) #ifdef CONFIG_CGROUP_WRITEBACK +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); +void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void __inode_attach_wb(struct inode *inode, struct page *page); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct blkcg *blkcg); + /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest @@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + 
*/ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +/** + * wb_find_current - find wb for %current on a bdi + * @bdi: bdi of interest + * + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returned wb. + * NULL if not found. + */ +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; + + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + + /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). 
+ */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; +} + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. 
+ */ +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return inode->i_wb; +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } +static inline struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + return bdi->wb.congested; +} + +static inline void wb_congested_put(struct bdi_writeback_congested *congested) +{ +} + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + return &bdi->wb; +} + +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + return &bdi->wb; +} + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return &inode_to_bdi(inode)->wb; +} + +static inline void wb_memcg_offline(struct mem_cgroup *memcg) +{ +} + +static inline void wb_blkcg_offline(struct blkcg *blkcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* _LINUX_BACKING_DEV_H */ -- cgit v1.2.3 From ec8a6f2643923ee5b74d24fa8d134240379f436b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:41 -0400 Subject: writeback: make congestion functions per bdi_writeback Currently, all congestion functions take bdi (backing_dev_info) and always operate on the root wb (bdi->wb) and the congestion state from the block layer is propagated only for the root blkcg. This patch introduces {set|clear}_wb_congested() and wb_congested() which take a bdi_writeback_congested and bdi_writeback respectively. 
The bdi counterparts are now wrappers invoking the wb based functions on @bdi->wb. While converting clear_bdi_congested() to clear_wb_congested(), the local variable declaration order between @wqh and @bit is swapped for cosmetic reasons. This patch just adds the new wb based functions. The following patches will apply them. v2: Updated for bdi_writeback_congested. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8ae59df2e3d1..2c498a2a8268 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,27 +167,13 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } -static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) +static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { - if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.congested->state & bdi_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} + struct backing_dev_info *bdi = wb->bdi; -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, cong_bits); + return wb->congested->state & cong_bits; } long congestion_wait(int sync, long timeout); @@ -454,4 +440,25 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int bdi_congested(struct 
backing_dev_info *bdi, int cong_bits) +{ + return wb_congested(&bdi->wb, cong_bits); +} + +static inline int bdi_read_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_sync_congested); +} + +static inline int bdi_write_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_async_congested); +} + +static inline int bdi_rw_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + #endif /* _LINUX_BACKING_DEV_H */ -- cgit v1.2.3 From 703c270887bb5106c4c46a00cc7477d30d5e04f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:44 -0400 Subject: writeback: implement and use inode_congested() In several places, bdi_congested() and its wrappers are used to determine whether more IOs should be issued. With cgroup writeback support, this question can't be answered solely based on the bdi (backing_dev_info). It's dependent on whether the filesystem and bdi support cgroup writeback and the blkcg the inode is associated with. This patch implements inode_congested() and its wrappers which take @inode and determines the congestion state considering cgroup writeback. The new functions replace bdi_*congested() calls in places where the query is about specific inode and task. There are several filesystem users which also fit this criteria but they should be updated when each filesystem implements cgroup writeback support. v2: Now that a given inode is associated with only one wb, congestion state can be determined independent from the asking task. Drop @task. Spotted by Vivek. Also, converted to take @inode instead of @mapping and renamed to inode_congested(). 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2c498a2a8268..6f0882105f95 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -230,6 +230,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); +int inode_congested(struct inode *inode, int cong_bits); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -438,8 +439,29 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +static inline int inode_congested(struct inode *inode, int cong_bits) +{ + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int inode_read_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_sync_congested); +} + +static inline int inode_write_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_async_congested); +} + +static inline int inode_rw_congested(struct inode *inode) +{ + return inode_congested(inode, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) { return wb_congested(&bdi->wb, cong_bits); -- cgit v1.2.3 From d6c10f1fc8626dc55946f4768ae322b4c57b07dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:45 -0400 Subject: writeback: implement WB_has_dirty_io wb_state flag Currently, wb_has_dirty_io() determines whether a wb (bdi_writeback) has any dirty inode by testing all three IO lists on each invocation without actively keeping track. 
For cgroup writeback support, a single bdi will host multiple wb's each of which will host dirty inodes separately and we'll need to make bdi_has_dirty_io(), which currently only represents the root wb, aggregate has_dirty_io from all member wb's, which requires tracking transitions in has_dirty_io state on each wb. This patch introduces inode_wb_list_{move|del}_locked() to consolidate IO list operations leaving queue_io() the only other function which directly manipulates IO lists (via move_expired_inodes()). All three functions are updated to call wb_io_lists_[de]populated() which keep track of whether the wb has dirty inodes or not and record it using the new WB_has_dirty_io flag. inode_wb_list_move_locked()'s return value indicates whether the wb had no dirty inodes before. mark_inode_dirty() is restructured so that the return value of inode_wb_list_move_locked() can be used for deciding whether to wake up the wb. While at it, change {bdi|wb}_has_dirty_io()'s return values to bool. These functions were returning 0 and 1 before. Also, add a comment explaining the synchronization of wb_state flags. v2: Updated to accommodate b_dirty_time. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6f0882105f95..3c8403c012ce 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -int bdi_has_dirty_io(struct backing_dev_info *bdi); +bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -37,11 +37,9 @@ extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -static inline int wb_has_dirty_io(struct bdi_writeback *wb) +static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return test_bit(WB_has_dirty_io, &wb->state); } static inline void __add_wb_stat(struct bdi_writeback *wb, -- cgit v1.2.3 From 95a46c65e3c09edb9f17dabf2dc16670cd328739 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:47 -0400 Subject: writeback: make bdi_has_dirty_io() take multiple bdi_writeback's into account bdi_has_dirty_io() used to only reflect whether the root wb (bdi_writeback) has dirty inodes. For cgroup writeback support, it needs to take all active wb's into account. If any wb on the bdi has dirty inodes, bdi_has_dirty_io() should return true. 
To achieve that, as inode_wb_list_{move|del}_locked() now keep track of the dirty state transition of each wb, the number of dirty wbs can be counted in the bdi; however, bdi is already aggregating wb->avg_write_bandwidth which can easily be guaranteed to be > 0 when there are any dirty inodes by ensuring wb->avg_write_bandwidth can't dip below 1. bdi_has_dirty_io() can simply test whether bdi->tot_write_bandwidth is zero or not. While this bumps the value of wb->avg_write_bandwidth to one when it used to be zero, this shouldn't cause any meaningful behavior difference. bdi_has_dirty_io() is made an inline function which tests whether ->tot_write_bandwidth is non-zero. Also, WARN_ON_ONCE()'s on its value are added to inode_wb_list_{move|del}_locked(). Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3c8403c012ce..0839e44105bd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -42,6 +41,15 @@ static inline bool wb_has_dirty_io(struct bdi_writeback *wb) return test_bit(WB_has_dirty_io, &wb->state); } +static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + /* + * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are + * any dirty wbs. See wb_update_write_bandwidth(). 
+ */ + return atomic_long_read(&bdi->tot_write_bandwidth); +} + static inline void __add_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { -- cgit v1.2.3 From ebe41ab0c79d5633123f6faa3265a1a63c5f22d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:50 -0400 Subject: writeback: implement bdi_for_each_wb() This will be used to implement bdi-wide operations which should be distributed across all its cgroup bdi_writebacks. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0839e44105bd..c7979806baee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -383,6 +383,61 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +struct wb_iter { + int start_blkcg_id; + struct radix_tree_iter tree_iter; + void **slot; +}; + +static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, + struct backing_dev_info *bdi) +{ + struct radix_tree_iter *titer = &iter->tree_iter; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (iter->start_blkcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); + iter->start_blkcg_id = -1; + } else { + iter->slot = radix_tree_next_slot(iter->slot, titer, 0); + } + + if (!iter->slot) + iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0); + if (iter->slot) + return *iter->slot; + return NULL; +} + +static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, + struct backing_dev_info *bdi, + int start_blkcg_id) +{ + iter->start_blkcg_id = start_blkcg_id; + + if (start_blkcg_id) + return __wb_iter_next(iter, bdi); + else + return &bdi->wb; +} + +/** + * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * @wb_cur: 
cursor struct bdi_writeback pointer + * @bdi: bdi to walk wb's of + * @iter: pointer to struct wb_iter to be used as iteration buffer + * @start_blkcg_id: blkcg ID to start iteration from + * + * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending + * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * to be used as temp storage during iteration. rcu_read_lock() must be + * held throughout iteration. + */ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ + (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -445,6 +500,14 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +struct wb_iter { + int next_id; +}; + +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((iter)->next_id = (start_blkcg_id); \ + ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); ) + static inline int inode_congested(struct inode *inode, int cong_bits) { return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -- cgit v1.2.3 From c00ddad39f512b1a81e25b7892217ce10efab0f1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:51 -0400 Subject: writeback: remove bdi_start_writeback() bdi_start_writeback() is a thin wrapper on top of __wb_start_writeback() which is used only by laptop_mode_timer_fn(). This patch removes bdi_start_writeback(), renames __wb_start_writeback() to wb_start_writeback() and makes laptop_mode_timer_fn() use it instead. This doesn't cause any functional difference and will ease making laptop_mode_timer_fn() cgroup writeback aware. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c7979806baee..0ff40c228bee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,8 +25,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From bc05873dccd27d75d6acdf812c3edfb181f1ba17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:53 -0400 Subject: writeback: make writeback_in_progress() take bdi_writeback instead of backing_dev_info writeback_in_progress() currently takes @bdi and returns whether writeback is in progress on its root wb (bdi_writeback). In preparation for cgroup writeback support, make it take wb instead. While at it, make it an inline function. This patch doesn't make any functional difference. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0ff40c228bee..f04956c900ec 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -156,7 +156,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); extern struct backing_dev_info noop_backing_dev_info; -int writeback_in_progress(struct backing_dev_info *bdi); +/** + * writeback_in_progress - determine whether there is writeback in progress + * @wb: bdi_writeback of interest + * + * Determine whether there is writeback waiting to be handled against a + * bdi_writeback. + */ +static inline bool writeback_in_progress(struct bdi_writeback *wb) +{ + return test_bit(WB_writeback_running, &wb->state); +} static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { -- cgit v1.2.3 From 9ecf4866c018aeb304a7b49216c4d183665becb7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:54 -0400 Subject: writeback: make bdi_start_background_writeback() take bdi_writeback instead of backing_dev_info bdi_start_background_writeback() currently takes @bdi and kicks the root wb (bdi_writeback). In preparation for cgroup writeback support, make it take wb instead. This patch doesn't make any functional difference. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f04956c900ec..9cc11e5b97ca 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -27,7 +27,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); -void bdi_start_background_writeback(struct backing_dev_info *bdi); +void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From bafc0dba1e20d84578d7098d32caf63441e5743d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:37:23 -0600 Subject: buffer, writeback: make __block_write_full_page() honor cgroup writeback [__]block_write_full_page() is used to implement ->writepage in various filesystems. All writeback logic is now updated to handle cgroup writeback and the block cgroup to issue IOs for is encoded in writeback_control and can be retrieved from the inode; however, [__]block_write_full_page() currently ignores the blkcg indicated by inode and issues all bio's without explicit blkcg association. This patch adds submit_bh_blkcg() which associates the bio with the specified blkio cgroup before issuing and uses it in __block_write_full_page() so that the issued bio's are associated with inode_to_wb_blkcg_css(inode). v2: Updated for per-inode wb association. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9cc11e5b97ca..e9d7373f5f93 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -393,6 +393,12 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +static inline struct cgroup_subsys_state * +inode_to_wb_blkcg_css(struct inode *inode) +{ + return inode_to_wb(inode)->blkcg_css; +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -510,6 +516,12 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } +static inline struct cgroup_subsys_state * +inode_to_wb_blkcg_css(struct inode *inode) +{ + return blkcg_root_css; +} + struct wb_iter { int next_id; }; -- cgit v1.2.3 From 21c6321fbb3a3787af07f1bc031d713a707fb69c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:49 -0400 Subject: writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Currently, majority of cgroup writeback support including all the above functions are implemented in include/linux/backing-dev.h and mm/backing-dev.c; however, the portion closely related to writeback logic implemented in include/linux/writeback.h and mm/page-writeback.c will expand to support foreign writeback detection and correction. This patch moves wb[_try]_get() and wb_put() to include/linux/backing-dev-defs.h so that they can be used from writeback.h and inode_{attach|detach}_wb() to writeback.h and page-writeback.c. This is pure reorganization and doesn't introduce any functional changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 82 --------------------------------------------- 1 file changed, 82 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e9d7373f5f93..5c978a924157 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); -void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); int inode_congested(struct inode *inode, int cong_bits); @@ -264,37 +263,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } -/** - * wb_tryget - try to increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - return percpu_ref_tryget(&wb->refcnt); - return true; -} - -/** - * wb_get - increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline void wb_get(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_get(&wb->refcnt); -} - -/** - * wb_put - decrement a wb's refcount - * @wb: bdi_writeback to put - */ -static inline void wb_put(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); -} - /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest @@ -353,35 +321,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_attach_wb - associate an inode with its wb - * @inode: inode of interest - * @page: page being dirtied (may be NULL) - * - * If @inode doesn't have its wb, associate it with 
the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o - * @inode->i_lock. - */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ - if (!inode->i_wb) - __inode_attach_wb(inode, page); -} - -/** - * inode_detach_wb - disassociate an inode from its wb - * @inode: inode of interest - * - * @inode is being freed. Detach from its wb. - */ -static inline void inode_detach_wb(struct inode *inode) -{ - if (inode->i_wb) { - wb_put(inode->i_wb); - inode->i_wb = NULL; - } -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested) { } -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - return true; -} - -static inline void wb_get(struct bdi_writeback *wb) -{ -} - -static inline void wb_put(struct bdi_writeback *wb) -{ -} - static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; @@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ -} - -static inline void inode_detach_wb(struct inode *inode) -{ -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; -- cgit v1.2.3 From b16b1deb553adcd7b3b7ce3e6d6fd1b923f314da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:39:48 -0600 Subject: writeback: make writeback_control track the inode being written back Currently, for cgroup writeback, the IO submission paths directly associate the bio's with the blkcg from inode_to_wb_blkcg_css(); however, it'd be necessary to keep more writeback context to implement foreign inode writeback detection. 
wbc (writeback_control) is the natural fit for the extra context - it persists throughout the writeback of each inode and is passed all the way down to IO submission paths. This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and wbc_attach_fdatawrite_inode() which are used to associate wbc with the inode being written back. IO submission paths now use wbc_init_bio() instead of directly associating bio's with blkcg themselves. This leaves inode_to_wb_blkcg_css() w/o any user. The function is removed. wbc currently only tracks the associated wb (bdi_writeback). Future patches will add more for foreign inode detection. The association is established under i_lock which will be depended upon when migrating foreign inodes to other wb's. As currently, once established, inode to wb association never changes, going through wbc when initializing bio's doesn't cause any behavior changes. v2: submit_blk_blkcg() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5c978a924157..b1d2489a6536 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return inode_to_wb(inode)->blkcg_css; -} - struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return blkcg_root_css; -} - struct wb_iter { int next_id; }; -- cgit v1.2.3 From 682aa8e1a6a1504a4caaa62e6c2c9daae3757210 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:53 -0400 Subject: writeback: implement unlocked_inode_to_wb transaction and use it for stat updates The mechanism for detecting whether an inode should switch its wb (bdi_writeback) association is now in place. This patch builds the framework for the actual switching. This patch adds a new inode flag I_WB_SWITCHING, which has two functions. First, the easy one, it ensures that there's only one switching in progress for a given inode. Second, it's used as a mechanism to synchronize wb stat updates. The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters but track the current number of dirty pages and pages under writeback respectively. 
As such, when an inode is moved from one wb to another, the inode's portion of those stats has to be transferred together; unfortunately, this is a bit tricky as those stat updates are percpu operations which are performed without holding any lock in some places. This patch solves the problem in a similar way as memcg. Each such lockless stat update is wrapped in a transaction surrounded by unlocked_inode_to_wb_begin/end(). During normal operation, they map to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted, mapping->tree_lock is grabbed across the transaction. In turn, the switching path sets I_WB_SWITCHING and waits for an RCU grace period to pass before actually starting to switch, which guarantees that all stat update paths are synchronizing against mapping->tree_lock. This patch still doesn't implement the actual switching. v3: Updated on top of the recent cancel_dirty_page() updates. unlocked_inode_to_wb_begin() now nests inside mem_cgroup_begin_page_stat() to match the locking order. v2: The i_wb access transaction will be used for !stat accesses too. Function names and comments updated accordingly. 
s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/ s/switch_wb/switch_wbs/ Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b1d2489a6536..73ffa32e58ee 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +/** + * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction + * @inode: target inode + * @lockedp: temp bool output param, to be passed to the end function + * + * The caller wants to access the wb associated with @inode but isn't + * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * function determines the wb associated with @inode and ensures that the + * association doesn't change until the transaction is finished with + * unlocked_inode_to_wb_end(). + * + * The caller must call unlocked_inode_to_wb_end() with *@lockedp + * afterwards and can't sleep during transaction. IRQ may or may not be + * disabled on return. + */ +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + rcu_read_lock(); + + /* + * Paired with store_release in inode_switch_wb_work_fn() and + * ensures that we see the new wb if we see cleared I_WB_SWITCH. 
+ */ + *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + + if (unlikely(*lockedp)) + spin_lock_irq(&inode->i_mapping->tree_lock); + return inode_to_wb(inode); +} + +/** + * unlocked_inode_to_wb_end - end inode wb access transaction + * @inode: target inode + * @locked: *@lockedp from unlocked_inode_to_wb_begin() + */ +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ + if (unlikely(locked)) + spin_unlock_irq(&inode->i_mapping->tree_lock); + + rcu_read_unlock(); +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + return inode_to_wb(inode); +} + +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ +} + static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } -- cgit v1.2.3 From aaa2cacf8184e2a92accb8e443b1608d65f9a13f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:55 -0400 Subject: writeback: add lockdep annotation to inode_to_wb() With the previous three patches, all operations which acquire wb from inode are either under one of inode->i_lock, mapping->tree_lock or wb->list_lock or protected by an unlocked_inode_to_wb transaction. This will be depended upon by foreign inode wb switching. This patch adds a lockdep assertion to inode_to_wb() so that usages outside the locks listed above can be caught easily. There are three exceptions. * locked_inode_to_wb_and_lock_list() is holding wb->list_lock but the wb may not be the inode's. Ensuring that is the function's role after all. Updated to deref inode->i_wb directly. * unlocked_inode_to_wb_begin() is usually protected by the combination of !I_WB_SWITCH and rcu_read_lock(). Updated to deref inode->i_wb directly. 
* inode_congested() wants to test whether inode->i_wb is set before starting the transaction. Added inode_to_wb_is_valid() which tests inode->i_wb directly. v5: might_lock() removed. It annotates that the lock is grabbed w/ irq enabled which isn't the case and triggers lockdep warnings spuriously. v4: might_lock() added to unlocked_inode_to_wb_begin(). v3: inode_congested() conversion added. v2: locked_inode_to_wb_and_lock_list() was missing in the first version. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 73ffa32e58ee..dfce80869145 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -321,14 +321,34 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } +/** + * inode_to_wb_is_valid - test whether an inode has a wb associated + * @inode: inode of interest + * + * Returns %true if @inode has a wb associated. May be called without any + * locking. + */ +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return inode->i_wb; +} + /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * - * Returns the wb @inode is currently associated with. + * Returns the wb @inode is currently associated with. The caller must be + * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * associated wb's list_lock. 
*/ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && + (!lockdep_is_held(&inode->i_lock) && + !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_wb->list_lock))); +#endif return inode->i_wb; } @@ -360,7 +380,12 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) if (unlikely(*lockedp)) spin_lock_irq(&inode->i_mapping->tree_lock); - return inode_to_wb(inode); + + /* + * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. + * inode_to_wb() will bark. Deref directly. + */ + return inode->i_wb; } /** @@ -459,6 +484,11 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return true; +} + static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; -- cgit v1.2.3 From 46b15caa7cb19b0f6e3bc8ebaee5bc1bb2e35110 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Jun 2015 18:48:31 -0400 Subject: vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB FS_CGROUP_WRITEBACK indicates whether a file_system_type supports cgroup writeback; however, different super_blocks of the same file_system_type may or may not support cgroup writeback depending on filesystem options. This patch replaces FS_CGROUP_WRITEBACK with a per-super_block flag. super_block->s_flags carries some internal flags in the high bits but it's exposed to userland through the uapi header and is running out of space anyway. This patch adds a new field super_block->s_iflags to carry kernel-internal flags. It is currently only used by the new SB_I_CGROUPWB flag whose concatenated and abbreviated name is for consistency with other super_block flags. ext2_fill_super() is updated to set SB_I_CGROUPWB. v2: Added super_block->s_iflags instead of stealing another high bit from sb->s_flags as suggested by Christoph and Jan. 
Signed-off-by: Tejun Heo Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Cc: Jan Kara Cc: linux-ext4@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/backing-dev.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index dfce80869145..a13181a42b9a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -260,7 +260,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return bdi_cap_account_dirty(bdi) && (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && - (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); + (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** -- cgit v1.2.3