Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	418
1 file changed, 275 insertions(+), 143 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76853088f66b..aa1074d3031b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -104,6 +104,13 @@ struct scan_control {
 
 	/* Number of pages freed so far during a call to shrink_zones() */
 	unsigned long nr_reclaimed;
+
+	/*
+	 * Reclaim pages from a vma. If the page is shared by other tasks
+	 * it is zapped from a vma without reclaim so it ends up remaining
+	 * on memory until last task zap it.
+	 */
+	struct vm_area_struct *target_vma;
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -146,6 +153,12 @@ int vm_swappiness = 60;
  */
 unsigned long vm_total_pages;
 
+#ifdef CONFIG_KSWAPD_CPU_AFFINITY_MASK
+char *kswapd_cpu_mask = CONFIG_KSWAPD_CPU_AFFINITY_MASK;
+#else
+char *kswapd_cpu_mask = NULL;
+#endif
+
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
@@ -281,6 +294,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	long batch_size = shrinker->batch ? shrinker->batch
 					  : SHRINK_BATCH;
 	long scanned = 0, next_deferred;
+	long min_cache_size = batch_size;
+
+	if (current_is_kswapd())
+		min_cache_size = 0;
 
 	freeable = shrinker->count_objects(shrinker, shrinkctl);
 	if (freeable == 0)
@@ -348,7 +365,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 * scanning at high prio and therefore should try to reclaim as much as
 	 * possible.
	 */
-	while (total_scan >= batch_size ||
+	while (total_scan > min_cache_size ||
 	       total_scan >= freeable) {
 		unsigned long ret;
 		unsigned long nr_to_scan = min(batch_size, total_scan);
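The new min_cache_size only changes the loop bound for kswapd: direct reclaimers still stop once fewer than batch_size objects remain in the cache, while kswapd (min_cache_size == 0) keeps calling the shrinker until the cache is fully drained. A minimal user-space sketch of that bound, with illustrative numbers (128-object batches against 300 freeable objects):

#include <stdio.h>

/* Mimics the do_shrink_slab() loop bound above; kswapd passes
 * min_cache_size == 0, everyone else passes batch_size. */
static int batches(long total_scan, long freeable, long min_cache_size,
		   long batch_size)
{
	int n = 0;

	while (total_scan > min_cache_size || total_scan >= freeable) {
		long nr_to_scan = total_scan < batch_size ? total_scan
							  : batch_size;

		total_scan -= nr_to_scan;
		n++;
	}
	return n;
}

int main(void)
{
	printf("direct reclaim: %d batches\n", batches(300, 300, 128, 128));
	printf("kswapd:         %d batches\n", batches(300, 300, 0, 128));
	return 0;
}

Direct reclaim issues two batches and leaves 44 objects cached; kswapd issues a third, 44-object batch and empties the cache.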
@@ -385,6 +402,35 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	return freed;
 }
 
+static void shrink_slab_lmk(gfp_t gfp_mask, int nid,
+			    struct mem_cgroup *memcg,
+			    unsigned long nr_scanned,
+			    unsigned long nr_eligible)
+{
+	struct shrinker *shrinker;
+
+	if (nr_scanned == 0)
+		nr_scanned = SWAP_CLUSTER_MAX;
+
+	if (!down_read_trylock(&shrinker_rwsem))
+		goto out;
+
+	list_for_each_entry(shrinker, &shrinker_list, list) {
+		struct shrink_control sc = {
+			.gfp_mask = gfp_mask,
+		};
+
+		if (!(shrinker->flags & SHRINKER_LMK))
+			continue;
+
+		do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+	}
+
+	up_read(&shrinker_rwsem);
+out:
+	cond_resched();
+}
+
 /**
  * shrink_slab - shrink slab caches
  * @gfp_mask: allocation context
@@ -446,6 +492,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 			.memcg = memcg,
 		};
 
+		if (shrinker->flags & SHRINKER_LMK)
+			continue;
+
 		if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
 			continue;
 
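shrink_slab_lmk() walks the same shrinker_list as shrink_slab(), but the two sets are now disjoint: only shrinkers flagged SHRINKER_LMK are invoked here, and shrink_slab() skips those same entries, so a lowmemorykiller-style shrinker fires only from the reclaim points that call shrink_slab_lmk(). A hypothetical registration, sketched against this tree's 4.4-era shrinker API (the lmk_* names are made up for illustration; SHRINKER_LMK is the out-of-tree flag this patch set introduces):

#include <linux/shrinker.h>
#include <linux/vmstat.h>
#include <linux/swap.h>

static unsigned long lmk_count(struct shrinker *s, struct shrink_control *sc)
{
	/* report how much "pressure" the killer could relieve */
	return global_page_state(NR_FILE_PAGES);
}

static unsigned long lmk_scan(struct shrinker *s, struct shrink_control *sc)
{
	/* select and signal a victim task here; no slab objects are freed */
	return SHRINK_STOP;
}

static struct shrinker lmk_shrinker = {
	.count_objects	= lmk_count,
	.scan_objects	= lmk_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_LMK,	/* only ever run via shrink_slab_lmk() */
};

/* register_shrinker(&lmk_shrinker); from an init hook */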
@@ -915,7 +964,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
-		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		enum page_references references = PAGEREF_RECLAIM;
 		bool dirty, writeback;
 
 		cond_resched();
@@ -927,7 +976,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON_PAGE(PageActive(page), page);
-		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+		if (zone)
+			VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
 		sc->nr_scanned++;
 
@@ -1006,7 +1056,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Case 1 above */
 			if (current_is_kswapd() &&
 			    PageReclaim(page) &&
-			    test_bit(ZONE_WRITEBACK, &zone->flags)) {
+			    (zone && test_bit(ZONE_WRITEBACK, &zone->flags))) {
 				nr_immediate++;
 				goto keep_locked;
 
@@ -1072,7 +1122,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page,
-				ttu_flags|TTU_BATCH_FLUSH)) {
+				ttu_flags|TTU_BATCH_FLUSH,
+				sc->target_vma)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -1092,7 +1143,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		if (page_is_file_cache(page) &&
 		    (!current_is_kswapd() ||
-		     !test_bit(ZONE_DIRTY, &zone->flags))) {
+		     (zone &&
+		      !test_bit(ZONE_DIRTY, &zone->flags)))) {
 			/*
 			 * Immediately reclaim when written back.
 			 * Similar in principal to deactivate_page()
@@ -1195,7 +1247,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * we obviously don't have to worry about waking up a process
 		 * waiting on the page lock, because there are no references.
 		 */
-		__clear_page_locked(page);
+		__ClearPageLocked(page);
 
 free_it:
 		nr_reclaimed++;
@@ -1204,6 +1256,13 @@ free_it:
 		 * appear not as the counts should be low
 		 */
 		list_add(&page->lru, &free_pages);
+		/*
+		 * If pagelist are from multiple zones, we should decrease
+		 * NR_ISOLATED_ANON + x on freed pages in here.
+		 */
+		if (!zone)
+			dec_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 		continue;
 
 cull_mlocked:
@@ -1215,7 +1274,7 @@ cull_mlocked:
 
 activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
-		if (PageSwapCache(page) && vm_swap_full())
+		if (PageSwapCache(page) && vm_swap_full(page_swap_info(page)))
 			try_to_free_swap(page);
 		VM_BUG_ON_PAGE(PageActive(page), page);
 		SetPageActive(page);
@@ -1249,6 +1308,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
+		/* Doesn't allow to write out dirty page */
+		.may_writepage = 0,
 	};
 	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
 	struct page *page, *next;
@@ -1256,7 +1317,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	list_for_each_entry_safe(page, next, page_list, lru) {
 		if (page_is_file_cache(page) && !PageDirty(page) &&
-		    !isolated_balloon_page(page)) {
+		    !__PageMovable(page)) {
 			ClearPageActive(page);
 			list_move(&page->lru, &clean_pages);
 		}
@@ -1270,6 +1331,42 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	return ret;
 }
 
+#ifdef CONFIG_PROCESS_RECLAIM
+unsigned long reclaim_pages_from_list(struct list_head *page_list,
+					struct vm_area_struct *vma)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.target_vma = vma,
+	};
+
+	unsigned long nr_reclaimed;
+	struct page *page;
+	unsigned long dummy1, dummy2, dummy3, dummy4, dummy5;
+
+	list_for_each_entry(page, page_list, lru)
+		ClearPageActive(page);
+
+	nr_reclaimed = shrink_page_list(page_list, NULL, &sc,
+			TTU_UNMAP|TTU_IGNORE_ACCESS,
+			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				page_is_file_cache(page));
+		putback_lru_page(page);
+	}
+
+	return nr_reclaimed;
+}
+#endif
+
 /*
  * Attempt to remove the specified page from its LRU.  Only take this page
  * if it is of the appropriate PageActive status.  Pages which are being
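reclaim_pages_from_list() takes pages the caller has already isolated and charged to NR_ISOLATED_*; passing zone == NULL into shrink_page_list() is what the zone-tolerant checks earlier in this diff enable, since the list may span zones. A hypothetical caller in the CONFIG_PROCESS_RECLAIM style (reclaim_vma_pages() and its arguments are invented for illustration):

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/vmstat.h>

static unsigned long reclaim_vma_pages(struct vm_area_struct *vma,
				       struct page **pages, int nr)
{
	LIST_HEAD(page_list);
	int i;

	for (i = 0; i < nr; i++) {
		if (isolate_lru_page(pages[i]))
			continue;
		list_add(&pages[i]->lru, &page_list);
		/* paired with the dec_zone_page_state() on free/putback */
		inc_zone_page_state(pages[i], NR_ISOLATED_ANON +
				    page_is_file_cache(pages[i]));
	}

	return reclaim_pages_from_list(&page_list, vma);
}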
@@ -1466,6 +1563,44 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+static int __too_many_isolated(struct zone *zone, int file,
+	struct scan_control *sc, int safe)
+{
+	unsigned long inactive, isolated;
+
+	if (file) {
+		if (safe) {
+			inactive = zone_page_state_snapshot(zone,
+					NR_INACTIVE_FILE);
+			isolated = zone_page_state_snapshot(zone,
+					NR_ISOLATED_FILE);
+		} else {
+			inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+			isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+		}
+	} else {
+		if (safe) {
+			inactive = zone_page_state_snapshot(zone,
+					NR_INACTIVE_ANON);
+			isolated = zone_page_state_snapshot(zone,
+					NR_ISOLATED_ANON);
+		} else {
+			inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+			isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+		}
+	}
+
+	/*
+	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+	 * won't get blocked by normal direct-reclaimers, forming a circular
+	 * deadlock.
+	 */
+	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+		inactive >>= 3;
+
+	return isolated > inactive;
+}
+
 /*
  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
  * then get resheduled. When there are massive number of tasks doing page
@@ -1474,33 +1609,22 @@ int isolate_lru_page(struct page *page)
  * unnecessary swapping, thrashing and OOM.
  */
 static int too_many_isolated(struct zone *zone, int file,
-		struct scan_control *sc)
+		struct scan_control *sc, int safe)
 {
-	unsigned long inactive, isolated;
-
 	if (current_is_kswapd())
 		return 0;
 
 	if (!sane_reclaim(sc))
 		return 0;
 
-	if (file) {
-		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
-	} else {
-		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+	if (unlikely(__too_many_isolated(zone, file, sc, 0))) {
+		if (safe)
+			return __too_many_isolated(zone, file, sc, safe);
+		else
+			return 1;
 	}
 
-	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
-	 */
-	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
-		inactive >>= 3;
-
-	return isolated > inactive;
+	return 0;
 }
 
 static noinline_for_stack void
@@ -1516,6 +1640,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 	while (!list_empty(page_list)) {
 		struct page *page = lru_to_page(page_list);
 		int lru;
+		int file;
 
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		list_del(&page->lru);
@@ -1532,8 +1657,11 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 		lru = page_lru(page);
 		add_page_to_lru_list(page, lruvec, lru);
 
+		file = is_file_lru(lru);
+		if (IS_ENABLED(CONFIG_ZCACHE))
+			if (file)
+				SetPageWasActive(page);
 		if (is_active_lru(lru)) {
-			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
 			reclaim_stat->recent_rotated[file] += numpages;
 		}
@@ -1590,15 +1718,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	unsigned long nr_immediate = 0;
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
+	int safe = 0;
 	struct zone *zone = lruvec_zone(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
-	while (unlikely(too_many_isolated(zone, file, sc))) {
+	while (unlikely(too_many_isolated(zone, file, sc, safe))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/* We are about to die and free our memory. Return now. */
 		if (fatal_signal_pending(current))
 			return SWAP_CLUSTER_MAX;
+
+		safe = 1;
 	}
 
 	lru_add_drain();
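too_many_isolated() is now two-phase: the cheap zone_page_state() counters, which can lag by the per-CPU vmstat deltas, are consulted first; only after shrink_inactive_list() has already slept once (safe == 1) does it pay for zone_page_state_snapshot(), which folds those deltas in. A stale counter therefore costs at most one congestion_wait() instead of stalling reclaim indefinitely. The decision flow, restated as a runnable sketch with stubbed counter values (the numbers are illustrative):

#include <stdbool.h>
#include <stdio.h>

static long approx_isolated = 600, approx_inactive = 512; /* per-cpu lag */
static long exact_isolated  = 480, exact_inactive  = 512; /* snapshot    */

static bool too_many_isolated(bool safe)
{
	if (approx_isolated > approx_inactive) {  /* cheap check fired    */
		if (safe)                         /* second pass: confirm */
			return exact_isolated > exact_inactive;
		return true;                      /* sleep once, retry    */
	}
	return false;
}

int main(void)
{
	printf("first pass: %d\n", too_many_isolated(false)); /* 1: wait */
	printf("safe retry: %d\n", too_many_isolated(true));  /* 0: go   */
	return 0;
}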
@@ -1855,6 +1986,12 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		}
 
 		ClearPageActive(page);	/* we are de-activating */
+		if (IS_ENABLED(CONFIG_ZCACHE))
+			/*
+			 * For zcache to know whether the page is from active
+			 * file list
+			 */
+			SetPageWasActive(page);
 		list_add(&page->lru, &l_inactive);
 	}
 
@@ -2075,8 +2212,9 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
 	 * lruvec even if it has plenty of old anonymous pages unless the
 	 * system is under heavy pressure.
 	 */
-	if (!inactive_file_is_low(lruvec) &&
-	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
+	    !inactive_file_is_low(lruvec) &&
+	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2449,15 +2587,23 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 					    sc->nr_scanned - nr_scanned,
 					    zone_lru_pages);
 
+		/*
+		 * Record the subtree's reclaim efficiency. The reclaimed
+		 * pages from slab is excluded here because the corresponding
+		 * scanned pages is not accounted. Moreover, freeing a page
+		 * by slab shrinking depends on each slab's object population,
+		 * making the cost model (i.e. scan:free) different from that
+		 * of LRU.
+		 */
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+			   sc->nr_scanned - nr_scanned,
+			   sc->nr_reclaimed - nr_reclaimed);
+
 		if (reclaim_state) {
 			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
 
-		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
-			   sc->nr_scanned - nr_scanned,
-			   sc->nr_reclaimed - nr_reclaimed);
-
 		if (sc->nr_reclaimed - nr_reclaimed)
 			reclaimable = true;
 
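Calling vmpressure() before folding reclaim_state->reclaimed_slab into sc->nr_reclaimed matters because slab frees carry no accounted scan cost, so counting them would overstate reclaim efficiency. A toy calculation with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned long scanned = 200, lru_reclaimed = 50, slab_reclaimed = 120;

	/* new ordering: pressure reflects the LRU scan:free cost model */
	printf("LRU only : %lu%%\n", lru_reclaimed * 100 / scanned);
	/* old ordering: unaccounted slab frees skewed the ratio upward */
	printf("with slab: %lu%%\n",
	       (lru_reclaimed + slab_reclaimed) * 100 / scanned);
	return 0;
}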
@@ -2531,6 +2677,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	gfp_t orig_mask;
 	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
 	bool reclaimable = false;
+	unsigned long lru_pages = 0;
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2558,6 +2705,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		 * to global LRU.
 		 */
 		if (global_reclaim(sc)) {
+			lru_pages += zone_reclaimable_pages(zone);
 			if (!cpuset_zone_allowed(zone,
 						 GFP_KERNEL | __GFP_HARDWALL))
 				continue;
@@ -2608,6 +2756,9 @@
 			reclaimable = true;
 	}
 
+	if (global_reclaim(sc))
+		shrink_slab_lmk(sc->gfp_mask, 0, NULL,
+				sc->nr_scanned, lru_pages);
 	/*
 	 * Restore to original mask to avoid the impact on the caller if we
 	 * promoted it to __GFP_HIGHMEM.
@@ -2966,18 +3117,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	} while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order,
-			  unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+			unsigned long balance_gap, int classzone_idx)
 {
-	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-				    balance_gap, classzone_idx))
-		return false;
+	unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-	if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
-				order, 0, classzone_idx) == COMPACT_SKIPPED)
-		return false;
+	/*
+	 * When checking from pgdat_balanced(), kswapd should stop and sleep
+	 * when it reaches the high order-0 watermark and let kcompactd take
+	 * over. Other callers such as wakeup_kswapd() want to determine the
+	 * true high-order watermark.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+		mark += (1UL << order);
+		order = 0;
+	}
 
-	return true;
+	return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
 
 /*
@@ -3027,7 +3183,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 			continue;
 		}
 
-		if (zone_balanced(zone, order, 0, i))
+		if (zone_balanced(zone, order, false, 0, i))
 			balanced_pages += zone->managed_pages;
 		else if (!order)
 			return false;
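With CONFIG_COMPACTION enabled, the pgdat_balanced()/kswapd paths now pass highorder == false, so a high-order wakeup is judged against the order-0 high watermark padded by 1 << order pages: kswapd stops reclaiming as soon as kcompactd could plausibly assemble the run, while wakeup_kswapd() (highorder == true) still tests the real high-order watermark. Illustrative arithmetic with made-up watermark values:

#include <stdio.h>

int main(void)
{
	unsigned long high_wmark = 1024, balance_gap = 0;
	int order = 3;				/* e.g. an order-3 wakeup  */
	unsigned long mark = high_wmark + balance_gap;

	mark += 1UL << order;			/* highorder == false path */
	printf("kswapd may sleep once order-0 free pages reach %lu\n", mark);
	return 0;
}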
@@ -3082,9 +3238,8 @@
 static bool kswapd_shrink_zone(struct zone *zone,
 			       int classzone_idx,
 			       struct scan_control *sc,
-			       unsigned long *nr_attempted)
+			       unsigned long lru_pages)
 {
-	int testorder = sc->order;
 	unsigned long balance_gap;
 	bool lowmem_pressure;
 
@@ -3092,17 +3247,6 @@
 	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
 
 	/*
-	 * Kswapd reclaims only single pages with compaction enabled. Trying
-	 * too hard to reclaim until contiguous free pages have become
-	 * available can hurt performance by evicting too much useful data
-	 * from memory. Do not reclaim more than needed for compaction.
-	 */
-	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
-	    compaction_suitable(zone, sc->order, 0, classzone_idx)
-							!= COMPACT_SKIPPED)
-		testorder = 0;
-
-	/*
 	 * We put equal pressure on every zone, unless one zone has way too
 	 * many pages free already. The "too many pages" is defined as the
 	 * high wmark plus a "gap" where the gap is either the low
@@ -3116,14 +3260,13 @@
 	 * reclaim is necessary
 	 */
 	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-	if (!lowmem_pressure && zone_balanced(zone, testorder,
+	if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
 						balance_gap, classzone_idx))
 		return true;
 
 	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
-
-	/* Account for the number of pages attempted to reclaim */
-	*nr_attempted += sc->nr_to_reclaim;
+	shrink_slab_lmk(sc->gfp_mask, zone_to_nid(zone), NULL,
+			sc->nr_scanned, lru_pages);
 
 	clear_bit(ZONE_WRITEBACK, &zone->flags);
 
@@ -3134,7 +3277,7 @@
 	 * waits.
 	 */
 	if (zone_reclaimable(zone) &&
-	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+	    zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
 		clear_bit(ZONE_CONGESTED, &zone->flags);
 		clear_bit(ZONE_DIRTY, &zone->flags);
 	}
@@ -3146,7 +3289,7 @@
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
 *
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
 *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -3163,8 +3306,7 @@
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
-							int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
@@ -3181,9 +3323,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long nr_attempted = 0;
 		bool raise_priority = true;
-		bool pgdat_needs_compaction = (order > 0);
+		unsigned long lru_pages = 0;
 
 		sc.nr_reclaimed = 0;
 
@@ -3218,7 +3359,7 @@
 				break;
 			}
 
-			if (!zone_balanced(zone, order, 0, 0)) {
+			if (!zone_balanced(zone, order, false, 0, 0)) {
 				end_zone = i;
 				break;
 			} else {
@@ -3234,32 +3375,23 @@
 		if (i < 0)
 			goto out;
 
+		/*
+		 * If we're getting trouble reclaiming, start doing writepage
+		 * even in laptop mode.
+		 */
+		if (sc.priority < DEF_PRIORITY - 2)
+			sc.may_writepage = 1;
+
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
 			if (!populated_zone(zone))
 				continue;
 
-			/*
-			 * If any zone is currently balanced then kswapd will
-			 * not call compaction as it is expected that the
-			 * necessary pages are already available.
-			 */
-			if (pgdat_needs_compaction &&
-			    zone_watermark_ok(zone, order,
-						low_wmark_pages(zone),
-						*classzone_idx, 0))
-				pgdat_needs_compaction = false;
+			lru_pages += zone_reclaimable_pages(zone);
 		}
 
 		/*
-		 * If we're getting trouble reclaiming, start doing writepage
-		 * even in laptop mode.
-		 */
-		if (sc.priority < DEF_PRIORITY - 2)
-			sc.may_writepage = 1;
-
-		/*
 		 * Now scan the zone in the dma->highmem direction, stopping
 		 * at the last zone which needs scanning.
 		 *
@@ -3295,8 +3427,7 @@
 			 * that that high watermark would be met at 100%
 			 * efficiency.
 			 */
-			if (kswapd_shrink_zone(zone, end_zone,
-					       &sc, &nr_attempted))
+			if (kswapd_shrink_zone(zone, end_zone, &sc, lru_pages))
 				raise_priority = false;
 		}
 
@@ -3309,49 +3440,29 @@
 		    pfmemalloc_watermark_ok(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
-		/*
-		 * Fragmentation may mean that the system cannot be rebalanced
-		 * for high-order allocations in all zones. If twice the
-		 * allocation size has been reclaimed and the zones are still
-		 * not balanced then recheck the watermarks at order-0 to
-		 * prevent kswapd reclaiming excessively. Assume that a
-		 * process requested a high-order can direct reclaim/compact.
-		 */
-		if (order && sc.nr_reclaimed >= 2UL << order)
-			order = sc.order = 0;
-
 		/* Check if kswapd should be suspending */
 		if (try_to_freeze() || kthread_should_stop())
 			break;
 
 		/*
-		 * Compact if necessary and kswapd is reclaiming at least the
-		 * high watermark number of pages as requsted
-		 */
-		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
-			compact_pgdat(pgdat, order);
-
-		/*
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
 		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1 &&
-		 !pgdat_balanced(pgdat, order, *classzone_idx));
+		 !pgdat_balanced(pgdat, order, classzone_idx));
 
 out:
 	/*
-	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
-	 * makes a decision on the order we were last reclaiming at. However,
-	 * if another caller entered the allocator slow path while kswapd
-	 * was awake, order will remain at the higher level
+	 * Return the highest zone idx we were reclaiming at so
+	 * prepare_kswapd_sleep() makes the same decisions as here.
 	 */
-	*classzone_idx = end_zone;
-	return order;
+	return end_zone;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+				int classzone_idx, int balanced_classzone_idx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3362,7 +3473,22 @@
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
+		/*
+		 * Compaction records what page blocks it recently failed to
+		 * isolate pages from and skips them in the future scanning.
+		 * When kswapd is going to sleep, it is reasonable to assume
+		 * that pages and compaction may succeed so reset the cache.
+		 */
+		reset_isolation_suitable(pgdat);
+
+		/*
+		 * We have freed the memory, now we should compact it to make
+		 * allocation of the requested order possible.
+		 */
+		wakeup_kcompactd(pgdat, order, classzone_idx);
+
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3372,7 +3498,8 @@
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -3385,14 +3512,6 @@
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
-		/*
-		 * Compaction records what page blocks it recently failed to
-		 * isolate pages from and skips them in the future scanning.
-		 * When kswapd is going to sleep, it is reasonable to assume
-		 * that pages and compaction may succeed so reset the cache.
-		 */
-		reset_isolation_suitable(pgdat);
-
 		if (!kthread_should_stop())
 			schedule();
@@ -3422,7 +3541,6 @@
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
-	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
 	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3435,7 +3553,7 @@
 	lockdep_set_current_reclaim_state(GFP_KERNEL);
 
-	if (!cpumask_empty(cpumask))
+	if (kswapd_cpu_mask == NULL && !cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
 	current->reclaim_state = &reclaim_state;
 
@@ -3455,24 +3573,19 @@
 	set_freezable();
 
 	order = new_order = 0;
-	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		bool ret;
 
 		/*
-		 * If the last balance_pgdat was unsuccessful it's unlikely a
-		 * new request of a similar or harder type will succeed soon
-		 * so consider going to sleep on the basis we reclaimed at
+		 * While we were reclaiming, there might have been another
+		 * wakeup, so check the values.
 		 */
-		if (balanced_classzone_idx >= new_classzone_idx &&
-		    balanced_order == new_order) {
-			new_order = pgdat->kswapd_max_order;
-			new_classzone_idx = pgdat->classzone_idx;
-			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = pgdat->nr_zones - 1;
-		}
+		new_order = pgdat->kswapd_max_order;
+		new_classzone_idx = pgdat->classzone_idx;
+		pgdat->kswapd_max_order = 0;
+		pgdat->classzone_idx = pgdat->nr_zones - 1;
 
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
@@ -3482,7 +3595,7 @@
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, balanced_order,
+			kswapd_try_to_sleep(pgdat, order, classzone_idx,
 					balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
@@ -3502,9 +3615,8 @@
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			balanced_classzone_idx = classzone_idx;
-			balanced_order = balance_pgdat(pgdat, order,
-						&balanced_classzone_idx);
+			balanced_classzone_idx = balance_pgdat(pgdat, order,
+							classzone_idx);
 		}
 	}
 
@@ -3534,7 +3646,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_balanced(zone, order, 0, 0))
+	if (zone_balanced(zone, order, true, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -3605,6 +3717,22 @@
 	return NOTIFY_OK;
 }
 
+static int set_kswapd_cpu_mask(pg_data_t *pgdat)
+{
+	int ret = 0;
+	cpumask_t tmask;
+
+	if (!kswapd_cpu_mask)
+		return 0;
+
+	cpumask_clear(&tmask);
+	ret = cpumask_parse(kswapd_cpu_mask, &tmask);
+	if (ret)
+		return ret;
+
+	return set_cpus_allowed_ptr(pgdat->kswapd, &tmask);
+}
+
 /*
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
@@ -3624,6 +3752,9 @@
 		pr_err("Failed to start kswapd on node %d\n", nid);
 		ret = PTR_ERR(pgdat->kswapd);
 		pgdat->kswapd = NULL;
+	} else if (kswapd_cpu_mask) {
+		if (set_kswapd_cpu_mask(pgdat))
+			pr_warn("error setting kswapd cpu affinity mask\n");
 	}
 	return ret;
 }
@@ -3649,7 +3780,8 @@ static int __init kswapd_init(void)
 	swap_setup();
 	for_each_node_state(nid, N_MEMORY)
 		kswapd_run(nid);
-	hotcpu_notifier(cpu_callback, 0);
+	if (kswapd_cpu_mask == NULL)
+		hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
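cpumask_parse() expects the comma-separated hexadecimal mask format also used by /proc/irq/*/smp_affinity, so CONFIG_KSWAPD_CPU_AFFINITY_MASK would be a string such as "3" (CPUs 0-1) or "f0" (CPUs 4-7). A minimal sketch of the parse step in isolation, assuming the same 4.4-era cpumask API used by set_kswapd_cpu_mask() above (the helper name is invented for illustration):

#include <linux/cpumask.h>

/* Returns 0 on success or a negative errno for a malformed mask string. */
static int parse_kswapd_affinity(const char *mask_str, cpumask_t *out)
{
	cpumask_clear(out);
	return cpumask_parse(mask_str, out);
}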