From 309381feaee564281c3d9e90fbca8963bb7428ad Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Thu, 23 Jan 2014 15:52:54 -0800
Subject: mm: dump page when hitting a VM_BUG_ON using VM_BUG_ON_PAGE

Most of the VM_BUG_ON assertions are performed on a page.  Usually, when
one of these assertions fails, we'll get a BUG_ON with a call stack and
the registers.

Judging by the recurring requests to add a small piece of code that
dumps the page to various VM_BUG_ON sites, the page dump is quite
useful to people debugging issues in mm.

This patch adds VM_BUG_ON_PAGE(cond, page) which, beyond doing what
VM_BUG_ON() does, also dumps the page before executing the actual
BUG_ON.

[akpm@linux-foundation.org: fix up includes]
Signed-off-by: Sasha Levin
Cc: "Kirill A. Shutemov"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d9cff6..2254f36b74b8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -603,7 +603,7 @@ void putback_lru_page(struct page *page)
 	bool is_unevictable;
 	int was_unevictable = PageUnevictable(page);
 
-	VM_BUG_ON(PageLRU(page));
+	VM_BUG_ON_PAGE(PageLRU(page), page);
 
 redo:
 	ClearPageUnevictable(page);
@@ -794,8 +794,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (!trylock_page(page))
 			goto keep;
 
-		VM_BUG_ON(PageActive(page));
-		VM_BUG_ON(page_zone(page) != zone);
+		VM_BUG_ON_PAGE(PageActive(page), page);
+		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
 		sc->nr_scanned++;
 
@@ -1079,14 +1079,14 @@ activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
 		if (PageSwapCache(page) && vm_swap_full())
 			try_to_free_swap(page);
-		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON_PAGE(PageActive(page), page);
 		SetPageActive(page);
 		pgactivate++;
keep_locked:
 		unlock_page(page);
keep:
 		list_add(&page->lru, &ret_pages);
-		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
+		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
 
 	free_hot_cold_page_list(&free_pages, 1);
@@ -1240,7 +1240,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		VM_BUG_ON(!PageLRU(page));
+		VM_BUG_ON_PAGE(!PageLRU(page), page);
 
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
@@ -1295,7 +1295,7 @@ int isolate_lru_page(struct page *page)
 {
 	int ret = -EBUSY;
 
-	VM_BUG_ON(!page_count(page));
+	VM_BUG_ON_PAGE(!page_count(page), page);
 
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
@@ -1366,7 +1366,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 		struct page *page = lru_to_page(page_list);
 		int lru;
 
-		VM_BUG_ON(PageLRU(page));
+		VM_BUG_ON_PAGE(PageLRU(page), page);
 		list_del(&page->lru);
 		if (unlikely(!page_evictable(page))) {
 			spin_unlock_irq(&zone->lru_lock);
@@ -1586,7 +1586,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 		page = lru_to_page(list);
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
-		VM_BUG_ON(PageLRU(page));
+		VM_BUG_ON_PAGE(PageLRU(page), page);
 		SetPageLRU(page);
 
 		nr_pages = hpage_nr_pages(page);
@@ -3701,7 +3701,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 		if (page_evictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
 
-			VM_BUG_ON(PageActive(page));
+			VM_BUG_ON_PAGE(PageActive(page), page);
 			ClearPageUnevictable(page);
 			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
 			add_page_to_lru_list(page, lruvec, lru);
-- cgit v1.2.3
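The VM_BUG_ON_PAGE() macro itself lands in include/linux/mmdebug.h as part
of this series.  A minimal sketch of the CONFIG_DEBUG_VM variant, assuming
the one-argument dump_page() of that era (the exact dump_page() signature
has changed across kernel versions, so treat this as illustrative rather
than the verbatim upstream definition):

	#ifdef CONFIG_DEBUG_VM
	#define VM_BUG_ON(cond)			BUG_ON(cond)
	/* Dump the page's flags, mapping and refcounts before dying. */
	#define VM_BUG_ON_PAGE(cond, page)				\
		do {							\
			if (unlikely(cond)) {				\
				dump_page(page);			\
				BUG();					\
			}						\
		} while (0)
	#else
	#define VM_BUG_ON(cond)			BUILD_BUG_ON_INVALID(cond)
	#define VM_BUG_ON_PAGE(cond, page)	VM_BUG_ON(cond)
	#endif

With !CONFIG_DEBUG_VM the condition still goes through
BUILD_BUG_ON_INVALID(), so it is type-checked at compile time but
generates no runtime code.
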
From 0b1fb40a3b1291f2f12f13f644ac95cf756a00e6 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 23 Jan 2014 15:53:22 -0800
Subject: mm: vmscan: shrink all slab objects if tight on memory

When reclaiming kmem, we currently don't scan slabs that have fewer
than batch_size objects (see shrink_slab_node()):

	while (total_scan >= batch_size) {
		shrinkctl->nr_to_scan = batch_size;
		shrinker->scan_objects(shrinker, shrinkctl);
		total_scan -= batch_size;
	}

If there are only a few shrinkers available, such behavior won't cause
any problems, because the batch_size is usually small, but if we have a
lot of slab shrinkers, which is perfectly possible since FS shrinkers
are now per-superblock, we can end up with hundreds of megabytes of
practically unreclaimable kmem objects.  For instance, mounting a
thousand ext2 FS images with a hundred files in each and iterating over
all the files using du(1) will result in about 200 MB of FS caches that
cannot be dropped even with the aid of the vm.drop_caches sysctl!

This problem was initially pointed out by Glauber Costa [*].  Glauber
proposed to fix it by making shrink_slab() always take at least one
pass, i.e. by turning the scan loop above into a do {} while () loop.
However, this proposal was rejected, because it could result in more
aggressive and frequent slab shrinking even under low memory pressure,
when total_scan is naturally very small.

This patch is a slightly modified version of Glauber's approach.
Similarly to Glauber's patch, it allows shrink_slab() to scan fewer
than batch_size objects, but only if the total number of objects we
want to scan (total_scan) is greater than the total number of objects
available (max_pass).  Since total_scan is capped at half of max_pass
when the current delta is small:

	if (delta < max_pass / 4)
		total_scan = min(total_scan, max_pass / 2);

this is only possible if we are scanning at high priority.  That said,
this patch shouldn't change vmscan behaviour when memory pressure is
low, but when we are tight on memory, we will do our best by trying to
reclaim all available objects, which sounds reasonable.

[*] http://www.spinics.net/lists/cgroups/msg06913.html

Signed-off-by: Vladimir Davydov
Cc: Mel Gorman
Cc: Michal Hocko
Cc: Johannes Weiner
Cc: Rik van Riel
Cc: Dave Chinner
Cc: Glauber Costa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2254f36b74b8..45c1cf61cbed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
 				nr_pages_scanned, lru_pages,
 				max_pass, delta, total_scan);
 
-	while (total_scan >= batch_size) {
+	/*
+	 * Normally, we should not scan less than batch_size objects in one
+	 * pass to avoid too frequent shrinker calls, but if the slab has less
+	 * than batch_size objects in total and we are really tight on memory,
+	 * we will try to reclaim all available objects, otherwise we can end
+	 * up failing allocations although there are plenty of reclaimable
+	 * objects spread over several slabs with usage less than the
+	 * batch_size.
+	 *
+	 * We detect the "tight on memory" situations by looking at the total
+	 * number of objects we want to scan (total_scan). If it is greater
+	 * than the total number of objects on slab (max_pass), we must be
+	 * scanning at high prio and therefore should try to reclaim as much as
+	 * possible.
+	 */
+	while (total_scan >= batch_size ||
+	       total_scan >= max_pass) {
 		unsigned long ret;
+		unsigned long nr_to_scan = min(batch_size, total_scan);
 
-		shrinkctl->nr_to_scan = batch_size;
+		shrinkctl->nr_to_scan = nr_to_scan;
 		ret = shrinker->scan_objects(shrinker, shrinkctl);
 		if (ret == SHRINK_STOP)
 			break;
 		freed += ret;
 
-		count_vm_events(SLABS_SCANNED, batch_size);
-		total_scan -= batch_size;
+		count_vm_events(SLABS_SCANNED, nr_to_scan);
+		total_scan -= nr_to_scan;
 
 		cond_resched();
 	}
-- cgit v1.2.3
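To make the before/after difference concrete, the loop's arithmetic can be
modelled outside the kernel.  A toy user-space simulation follows; the
scanned() helper, the 128-object batch and the sample numbers are
hypothetical stand-ins for illustration, not kernel API:

	#include <stdio.h>

	/* Model shrink_slab_node()'s scan loop. With patched == 0 the loop
	 * only runs while total_scan >= batch_size; with patched == 1 it
	 * also runs when total_scan exceeds the slab size (max_pass). */
	static unsigned long scanned(unsigned long total_scan,
				     unsigned long batch_size,
				     unsigned long max_pass, int patched)
	{
		unsigned long nr_scanned = 0;

		while (total_scan >= batch_size ||
		       (patched && total_scan >= max_pass)) {
			unsigned long nr = total_scan < batch_size ?
					   total_scan : batch_size;

			nr_scanned += nr;	/* objects handed to the shrinker */
			total_scan -= nr;
		}
		return nr_scanned;
	}

	int main(void)
	{
		/* A slab with only 100 objects; reclaim wants to scan 120. */
		printf("old: %lu objects scanned\n", scanned(120, 128, 100, 0));
		printf("new: %lu objects scanned\n", scanned(120, 128, 100, 1));
		return 0;
	}

The old behavior scans 0 objects (120 < 128, so the loop never runs); the
patched behavior scans all 120, which is exactly the "tight on memory"
case the commit message describes.
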
From ec97097bca147d5718a5d2c024d1ec740b10096d Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 23 Jan 2014 15:53:23 -0800
Subject: mm: vmscan: call NUMA-unaware shrinkers irrespective of nodemask

If a shrinker is not NUMA-aware, shrink_slab() should call it exactly
once with nid=0, but currently this is not the case: if node 0 is not
set in the nodemask, or if it is not online, we will not call such
shrinkers at all.  As a result some slabs will be left untouched under
some circumstances.  Let us fix it.

Signed-off-by: Vladimir Davydov
Reported-by: Dave Chinner
Cc: Mel Gorman
Cc: Michal Hocko
Cc: Johannes Weiner
Cc: Rik van Riel
Cc: Glauber Costa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 45c1cf61cbed..90c4075d8d75 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -369,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
-			if (!node_online(shrinkctl->nid))
-				continue;
-
-			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
-			    (shrinkctl->nid != 0))
-				break;
-
+		if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+			shrinkctl->nid = 0;
 			freed += shrink_slab_node(shrinkctl, shrinker,
-					nr_pages_scanned, lru_pages);
+					nr_pages_scanned, lru_pages);
+			continue;
+		}
+
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (node_online(shrinkctl->nid))
+				freed += shrink_slab_node(shrinkctl, shrinker,
+						nr_pages_scanned, lru_pages);
 		}
 	}
 
-- cgit v1.2.3
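The failure mode is easy to reproduce in a toy model of the dispatch
logic.  A minimal user-space sketch follows; struct shrinker, shrink_one()
and the boolean node mask are simplified stand-ins for the kernel's types,
not the real API:

	#include <stdio.h>
	#include <stdbool.h>

	#define MAX_NODES		4
	#define SHRINKER_NUMA_AWARE	0x1

	struct shrinker {
		unsigned int flags;
	};

	static int calls;

	static void shrink_slab_node(int nid)
	{
		calls++;	/* stand-in for the real per-node scan */
	}

	static void shrink_one(struct shrinker *s,
			       const bool nodes_to_scan[MAX_NODES])
	{
		if (!(s->flags & SHRINKER_NUMA_AWARE)) {
			/* NUMA-unaware: exactly once with nid = 0,
			 * regardless of the nodemask (the fix). */
			shrink_slab_node(0);
			return;
		}
		for (int nid = 0; nid < MAX_NODES; nid++)
			if (nodes_to_scan[nid])
				shrink_slab_node(nid);
	}

	int main(void)
	{
		struct shrinker s = { .flags = 0 };	/* NUMA-unaware */
		/* Node 0 absent from the mask: the old code skipped s. */
		const bool mask[MAX_NODES] = { false, true, false, false };

		shrink_one(&s, mask);
		printf("shrinker called %d time(s)\n", calls); /* 1 with fix */
		return 0;
	}

Before the fix, the NUMA-unaware case was handled inside the node loop, so
a mask without node 0 meant zero calls and permanently unscanned slabs.
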
From a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Wed, 29 Jan 2014 14:05:41 -0800
Subject: mm/page-writeback.c: do not count anon pages as dirtyable memory

The VM is currently heavily tuned to avoid swapping.  Whether that is
good or bad is a separate discussion, but as long as the VM won't swap
to make room for dirty cache, we cannot consider anonymous pages when
calculating the amount of dirtyable memory, the baseline to which
dirty_background_ratio and dirty_ratio are applied.

A simple workload that occupies a significant size (40+%, depending on
memory layout, storage speeds etc.) of memory with anon/tmpfs pages and
uses the remainder for a streaming writer demonstrates this problem.
In that case, the actual cache pages are a small fraction of what is
considered dirtyable overall, which results in a relatively large
portion of the cache pages being dirtied.  As kswapd starts rotating
these, random tasks enter direct reclaim and stall on IO.

Only consider free pages and file pages dirtyable.

Signed-off-by: Johannes Weiner
Reported-by: Tejun Heo
Tested-by: Tejun Heo
Reviewed-by: Rik van Riel
Cc: Mel Gorman
Cc: Wu Fengguang
Reviewed-by: Michal Hocko
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 90c4075d8d75..a9c74b409681 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
 }
 #endif
 
-unsigned long zone_reclaimable_pages(struct zone *zone)
+static unsigned long zone_reclaimable_pages(struct zone *zone)
 {
 	int nr;
 
@@ -3315,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
-	int nr;
-
-	nr = global_page_state(NR_ACTIVE_FILE) +
-	     global_page_state(NR_INACTIVE_FILE);
-
-	if (get_nr_swap_pages() > 0)
-		nr += global_page_state(NR_ACTIVE_ANON) +
-		      global_page_state(NR_INACTIVE_ANON);
-
-	return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
-- cgit v1.2.3
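This diff shows only the mm/vmscan.c side of the commit:
zone_reclaimable_pages() becomes static and the now-unused
global_reclaimable_pages() is deleted.  The substantive change lands in
mm/page-writeback.c.  A simplified sketch of the resulting accounting,
with highmem handling and the dirty balance reserve omitted (treat it as
illustrative, not the exact upstream function):

	/*
	 * Sketch: dirtyable memory now counts only free pages and file
	 * LRU pages. Anon pages are excluded because the VM will not
	 * swap them out to make room for dirty page cache.
	 */
	static unsigned long global_dirtyable_memory(void)
	{
		unsigned long x;

		x  = global_page_state(NR_FREE_PAGES);
		x += global_page_state(NR_ACTIVE_FILE);
		x += global_page_state(NR_INACTIVE_FILE);

		/* baseline for dirty_ratio and dirty_background_ratio */
		return x;
	}

Compare this with the deleted global_reclaimable_pages() above: the old
reclaimable count added the anon LRU lists whenever swap was available,
which is precisely what the dirtyable baseline must no longer do.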