diff options
Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 172 |
1 files changed, 117 insertions, 55 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bcdfbf8c36d..5b7402abc1d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -238,9 +238,21 @@ compound_page_dtor * const compound_page_dtors[] = { #endif }; +/* + * Try to keep at least this much lowmem free. Do not allow normal + * allocations below this point, only high priority ones. Automatically + * tuned according to the amount of memory in the system. + */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +/* + * Extra memory for the system to try freeing. Used to temporarily + * free memory, to make space for new workloads. Anyone can allocate + * down to the min watermarks controlled by min_free_kbytes above. + */ +int extra_free_kbytes = 0; + static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; @@ -267,8 +279,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. + */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); + + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -302,10 +343,9 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -541,6 +581,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); INIT_LIST_HEAD(&page->lru); @@ -558,6 +601,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); set_page_private(page, 0); @@ -1508,14 +1554,14 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + if (!PageBuddy(page)) { page++; continue; @@ -1729,13 +1775,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct page, lru); /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal @@ -2422,9 +2480,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -2436,6 +2491,9 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } @@ -2467,7 +2525,7 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -3063,8 +3121,6 @@ retry: * the allocation is high priority and these type of * allocations are system rather than user orientated */ - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - page = __alloc_pages_high_priority(gfp_mask, order, ac); if (page) { @@ -3939,8 +3995,7 @@ static int __parse_numa_zonelist_order(char *s) user_zonelist_order = ZONELIST_ORDER_ZONE; } else { printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + "Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4405,12 +4460,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -5343,7 +5397,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5362,6 +5415,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } @@ -5696,15 +5750,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; @@ -5825,8 +5882,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) } if (pages && s) - pr_info("Freeing %s memory: %ldK (%p - %p)\n", - s, pages << (PAGE_SHIFT - 10), start, end); + pr_info("Freeing %s memory: %ldK\n", + s, pages << (PAGE_SHIFT - 10)); return pages; } @@ -5878,22 +5935,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? str : ""); } /** @@ -6037,6 +6093,7 @@ static void setup_per_zone_lowmem_reserve(void) static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -6048,11 +6105,14 @@ static void __setup_per_zone_wmarks(void) } for_each_zone(zone) { - u64 tmp; + u64 min, low; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; - do_div(tmp, lowmem_pages); + min = (u64)pages_min * zone->managed_pages; + do_div(min, lowmem_pages); + low = (u64)pages_low * zone->managed_pages; + do_div(low, vm_total_pages); + if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -6073,11 +6133,13 @@ static void __setup_per_zone_wmarks(void) * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = tmp; + zone->watermark[WMARK_MIN] = min; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + low + (min >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + low + (min >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6200,7 +6262,7 @@ core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes - * changes. + * or extra_free_kbytes changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -6782,7 +6844,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_info("%s: [%lx, %lx) PFNs busy\n", + pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", __func__, outer_start, end); ret = -EBUSY; goto done; |
