| author | Linux Build Service Account <lnxbuild@quicinc.com> | 2017-11-28 03:16:26 -0800 |
|---|---|---|
| committer | Gerrit - the friendly Code Review server <code-review@localhost> | 2017-11-28 03:16:26 -0800 |
| commit | 5cac878f91ea571339f66eb9862596750c16978a | |
| tree | 4eccc4ca4337477af897ea52debba4616cfbe7db /arch | |
| parent | aa0356095ce99f1424df2770ce7eb77b59f0491d | |
| parent | efdbaef3e11a8a82f741baddad7d47f8606798aa | |
Merge "arm64: Hot-remove implementation for arm64"
Diffstat (limited to 'arch')
| -rw-r--r-- | arch/arm64/Kconfig | 1 |
|---|---|---|
| -rw-r--r-- | arch/arm64/include/asm/mmu.h | 7 |
| -rw-r--r-- | arch/arm64/include/asm/pgtable.h | 15 |
| -rw-r--r-- | arch/arm64/mm/init.c | 85 |
| -rw-r--r-- | arch/arm64/mm/mmu.c | 420 |
5 files changed, 521 insertions, 7 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 07090b418129..3cb501b93da6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -653,6 +653,7 @@ config HOTPLUG_CPU
 	  can be controlled through /sys/devices/system/cpu.
 
 config ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on !NUMA
 	def_bool y
 
 config ARCH_ENABLE_MEMORY_HOTREMOVE
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 990124a67eeb..7ad6b91d12c5 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -35,5 +35,12 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
 			       pgprot_t prot);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
+#ifdef CONFIG_MEMORY_HOTPLUG
+extern void hotplug_paging(phys_addr_t start, phys_addr_t size);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+extern void remove_pagetable(unsigned long start,
+	unsigned long end, bool direct);
+#endif
+#endif
 
 #endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 72b1c3fa1576..ecd7dc14330c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -461,6 +461,11 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 	return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
 }
 
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+	return (unsigned long) __va(pmd_page_paddr(pmd));
+}
+
 /* Find an entry in the third-level page table. */
 #define pte_index(addr)		(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 
@@ -512,6 +517,11 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 	return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
 }
 
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+	return (unsigned long) __va(pud_page_paddr(pud));
+}
+
 /* Find an entry in the second-level page table. */
 #define pmd_index(addr)		(((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
 
@@ -564,6 +574,11 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 	return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
 }
 
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+	return (unsigned long) __va(pgd_page_paddr(pgd));
+}
+
 /* Find an entry in the first-level page table. */
 #define pud_index(addr)		(((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
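The three `*_page_vaddr()` helpers added above are what the hot-remove walker in mmu.c uses to go from a table entry to the linear-map address of the next-level table. A minimal sketch of the pattern (the wrapper function here is illustrative, not part of the patch):

```c
/* Illustrative only: find the PTE slot for 'addr' the way the new
 * hot-remove code does, using pmd_page_vaddr() from this patch. */
static pte_t *example_pte_slot(pmd_t *pmd, unsigned long addr)
{
	/* __va() of the table's physical address, via the new helper */
	pte_t *pte_start = (pte_t *)pmd_page_vaddr(*pmd);

	return pte_start + pte_index(addr);	/* index into the table */
}
```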
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 1d4dcd57ac85..75d363be3f36 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -503,14 +503,74 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 	struct zone *zone;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
+	unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT);
 	int ret;
 
+	if (end_pfn > max_sparsemem_pfn) {
+		pr_err("end_pfn too big");
+		return -1;
+	}
+	hotplug_paging(start, size);
+
+	/*
+	 * Mark the first page in the range as unusable. This is needed
+	 * because __add_section (within __add_pages) wants pfn_valid
+	 * of it to be false, and on arm64 pfn_valid is implemented by
+	 * just checking the nomap flag of existing blocks.
+	 *
+	 * A small trick here is that __add_section() requires only
+	 * phys_start_pfn (that is, the first pfn of a section) to be
+	 * invalid. Regardless of whether it was assumed (by the function
+	 * author) that all pfns within a section are either all valid
+	 * or all invalid, this lets us avoid looping twice (once here,
+	 * a second time when memblock_clear_nomap() is called) through
+	 * all pfns of the section, and modify only one pfn. Thanks to
+	 * that, further on, in __add_zone() only this very first pfn is
+	 * skipped and the corresponding page is not flagged reserved.
+	 * Therefore it is enough to correct this setup only for it.
+	 *
+	 * When arch_add_memory() returns, the walk_memory_range()
+	 * function is called with the online_memory_block() callback,
+	 * whose execution finally reaches the memory_block_action()
+	 * function, where again only the first pfn of a memory block is
+	 * checked for being reserved. Above it was the first pfn of a
+	 * section; here it is a block, but since
+	 * (drivers/base/memory.c):
+	 *     sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+	 * (include/linux/memory.h):
+	 *     #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
+	 * we can treat block and section equivalently.
+	 */
+	memblock_mark_nomap(start, 1 << PAGE_SHIFT);
+
 	pgdat = NODE_DATA(nid);
 
 	zone = pgdat->node_zones +
 		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 	ret = __add_pages(nid, zone, start_pfn, nr_pages);
 
+	/*
+	 * Make the pages usable after they have been added.
+	 * This will make pfn_valid return true.
+	 */
+	memblock_clear_nomap(start, 1 << PAGE_SHIFT);
+
+	/*
+	 * This is a hack to avoid having to mix arch-specific code
+	 * into arch-independent code. SetPageReserved is supposed
+	 * to be called by __add_zone (within __add_section, within
+	 * __add_pages). However, when it is called there, it assumes
+	 * that pfn_valid returns true. Given the way pfn_valid is
+	 * implemented on arm64 (a check on the nomap flag), the only
+	 * way to make it evaluate true inside __add_zone would be to
+	 * clear the nomap flags of blocks in architecture-independent
+	 * code.
+	 *
+	 * To avoid that, we set the Reserved flag here, after we have
+	 * cleared the nomap flag above.
+	 */
+	SetPageReserved(pfn_to_page(start_pfn));
+
 	if (ret)
 		pr_warn("%s: Problem encountered in __add_pages() ret=%d\n",
 			__func__, ret);
@@ -519,21 +579,32 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
+static void kernel_physical_mapping_remove(unsigned long start,
+	unsigned long end)
+{
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	remove_pagetable(start, end, true);
+}
+
 int arch_remove_memory(u64 start, u64 size)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct page *page = pfn_to_page(start_pfn);
 	struct zone *zone;
-	int ret;
+	int ret = 0;
 
-	zone = page_zone(pfn_to_page(start_pfn));
+	zone = page_zone(page);
 	ret = __remove_pages(zone, start_pfn, nr_pages);
-	if (ret)
-		pr_warn("%s: Problem encountered in __remove_pages() ret=%d\n",
-			__func__, ret);
+	WARN_ON_ONCE(ret);
+
+	kernel_physical_mapping_remove(start, start + size);
 
 	return ret;
 }
-#endif
-#endif
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
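The nomap dance above only works because of how this tree implements pfn_valid(). For context, a paraphrase of the arm64 implementation from the same kernel era (hedged: the exact memblock accessor has varied across versions, so check the actual tree):

```c
/* Paraphrased 4.4-era arm64 pfn_valid(): a pure memblock query, so
 * marking a range NOMAP makes its pfns "invalid" with no per-page
 * state to maintain, and clearing NOMAP makes them valid again. */
int pfn_valid(unsigned long pfn)
{
	return memblock_is_map_memory(pfn << PAGE_SHIFT);
}
```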
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6c444d968323..2b48e29ec30c 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -605,6 +605,423 @@ void __init paging_init(void)
 	bootmem_init();
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static phys_addr_t pgd_pgtable_alloc(void)
+{
+	void *ptr = (void *)__get_free_page(PGALLOC_GFP);
+
+	if (!ptr || !pgtable_page_ctor(virt_to_page(ptr)))
+		BUG();
+
+	/* Ensure the zeroed page is visible to the page table walker */
+	dsb(ishst);
+	return __pa(ptr);
+}
+
+/*
+ * hotplug_paging() is used by memory hotplug to build new page tables
+ * for hot-added memory.
+ */
+void hotplug_paging(phys_addr_t start, phys_addr_t size)
+{
+	struct page *pg;
+	phys_addr_t pgd_phys = pgd_pgtable_alloc();
+	pgd_t *pgd = pgd_set_fixmap(pgd_phys);
+
+	memcpy(pgd, swapper_pg_dir, PAGE_SIZE);
+
+	__create_pgd_mapping(pgd, start, __phys_to_virt(start), size,
+		PAGE_KERNEL, pgd_pgtable_alloc);
+
+	cpu_replace_ttbr1(__va(pgd_phys));
+	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
+	cpu_replace_ttbr1(swapper_pg_dir);
+
+	pgd_clear_fixmap();
+
+	pg = phys_to_page(pgd_phys);
+	pgtable_page_dtor(pg);
+	__free_pages(pg, 0);
+}
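hotplug_paging() is only ever reached through the generic hotplug entry points. A hedged sketch of how a hypothetical platform driver would drive this code on a 4.4-era kernel — the base address is a placeholder, and offlining/locking details are elided:

```c
#include <linux/memory_hotplug.h>
#include <linux/sizes.h>

/* Hypothetical driver hook: hot-add, and later hot-remove, a 256 MiB
 * range. 'start' must be a section-aligned, platform-valid physical
 * address; nid is 0 since this patch makes hotplug depend on !NUMA. */
static int example_hotplug_cycle(u64 start)
{
	int ret;

	ret = add_memory(0, start, SZ_256M);	/* -> arch_add_memory() */
	if (ret)
		return ret;

	/*
	 * The range must be offlined first (e.g. by writing "offline"
	 * to /sys/devices/system/memory/memoryN/state); device-hotplug
	 * locking is omitted here for brevity.
	 */
	remove_memory(0, start, SZ_256M);	/* -> arch_remove_memory() */
	return 0;
}
```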
+#ifdef CONFIG_MEMORY_HOTREMOVE
+#define PAGE_INUSE 0xFD
+
+static void free_pagetable(struct page *page, int order, bool direct)
+{
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	/* bootmem pages have the reserved flag set */
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+
+		magic = (unsigned long)page->lru.next;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			while (nr_pages--)
+				free_reserved_page(page++);
+		}
+	} else {
+		/*
+		 * Only direct pagetable allocations (those made via
+		 * hotplug) call pgtable_page_ctor; vmemmap pgtable
+		 * allocations don't.
+		 */
+		if (direct)
+			pgtable_page_dtor(page);
+
+		free_pages((unsigned long)page_address(page), order);
+	}
+}
+
+static void free_pte_table(pmd_t *pmd, bool direct)
+{
+	pte_t *pte_start, *pte;
+	struct page *page;
+	int i;
+
+	pte_start = (pte_t *)pmd_page_vaddr(*pmd);
+	/* Check that no valid entry remains in the PTE table */
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	page = pmd_page(*pmd);
+
+	free_pagetable(page, 0, direct);
+
+	/*
+	 * This spin lock could only be taken in __pte_alloc_kernel
+	 * in mm/memory.c and nowhere else (for arm64). Not sure if
+	 * the function above can be called concurrently. In doubt,
+	 * I am leaving it here for now, but it probably can be removed.
+	 */
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pud_t *pud, bool direct)
+{
+	pmd_t *pmd_start, *pmd;
+	struct page *page;
+	int i;
+
+	pmd_start = (pmd_t *)pud_page_vaddr(*pud);
+	/* Check that no valid entry remains in the PMD table */
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	page = pud_page(*pud);
+
+	free_pagetable(page, 0, direct);
+
+	/* Same locking caveat as in free_pte_table() above. */
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+/*
+ * When the PUD is folded on the PGD (three levels of paging),
+ * there's no need to free PUDs.
+ */
+#if CONFIG_PGTABLE_LEVELS > 3
+static void free_pud_table(pgd_t *pgd, bool direct)
+{
+	pud_t *pud_start, *pud;
+	struct page *page;
+	int i;
+
+	pud_start = (pud_t *)pgd_page_vaddr(*pgd);
+	/* Check that no valid entry remains in the PUD table */
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	page = pgd_page(*pgd);
+
+	free_pagetable(page, 0, direct);
+
+	/* Same locking caveat as in free_pte_table() above. */
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#endif
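Why free_pud_table() is compiled out below four levels: with CONFIG_PGTABLE_LEVELS <= 3 the PUD is folded into the PGD, so a "PUD table" is the swapper_pg_dir page itself and there is no separately allocated page to free. This is visible in the generic folding header (quoted from include/asm-generic/pgtable-nopud.h of the same era, shown here for context only, not part of the patch):

```c
/* With a folded PUD, pud_offset() just reinterprets the PGD slot, so
 * clearing a "pud" really clears a swapper_pg_dir entry and the page
 * must never be handed to free_pagetable(). */
static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
{
	return (pud_t *)pgd;
}
```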
+static void remove_pte_table(pte_t *pte, unsigned long addr,
+	unsigned long end, bool direct)
+{
+	unsigned long next;
+	void *page_addr;
+
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+			/*
+			 * Do not free direct-mapped pages, since they were
+			 * already freed when offlining, or are simply not
+			 * in use.
+			 */
+			if (!direct)
+				free_pagetable(pte_page(*pte), 0, direct);
+
+			/* Same locking caveat as in free_pte_table(). */
+			spin_lock(&init_mm.page_table_lock);
+			pte_clear(&init_mm, addr, pte);
+			spin_unlock(&init_mm.page_table_lock);
+		} else {
+			/*
+			 * If we are here, we are freeing vmemmap pages,
+			 * since direct-mapped ranges to be freed are always
+			 * page aligned.
+			 *
+			 * If we are not removing the whole page, it means
+			 * other page structs in this page are still in use
+			 * and we cannot remove them. So fill the unused
+			 * page structs with 0xFD, and remove the page once
+			 * it is wholly filled with 0xFD.
+			 */
+			memset((void *)addr, PAGE_INUSE, next - addr);
+
+			page_addr = page_address(pte_page(*pte));
+			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+				free_pagetable(pte_page(*pte), 0, direct);
+
+				spin_lock(&init_mm.page_table_lock);
+				pte_clear(&init_mm, addr, pte);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+		}
+	}
+
+	/*
+	 * I am adding this flush here in symmetry with the x86 code.
+	 * Why it needs to be called here, and not in remove_p[mu]d,
+	 * is an open question.
+	 */
+	flush_tlb_all();
+}
+
+static void remove_pmd_table(pmd_t *pmd, unsigned long addr,
+	unsigned long end, bool direct)
+{
+	unsigned long next;
+	void *page_addr;
+	pte_t *pte;
+
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		/* Check whether we are using 2MB section mappings */
+		if (pmd_sect(*pmd)) {
+			if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+				if (!direct) {
+					free_pagetable(pmd_page(*pmd),
+						get_order(PMD_SIZE), direct);
+				}
+
+				/* Same locking caveat as above. */
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pmd_page(*pmd));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PMD_SIZE)) {
+					free_pagetable(pmd_page(*pmd),
+						get_order(PMD_SIZE), direct);
+
+					spin_lock(&init_mm.page_table_lock);
+					pmd_clear(pmd);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+			continue;
+		}
+
+		BUG_ON(!pmd_table(*pmd));
+
+		pte = pte_offset_map(pmd, addr);
+		remove_pte_table(pte, addr, next, direct);
+		free_pte_table(pmd, direct);
+	}
+}
+
+static void remove_pud_table(pud_t *pud, unsigned long addr,
+	unsigned long end, bool direct)
+{
+	unsigned long next;
+	pmd_t *pmd;
+	void *page_addr;
+
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+		if (!pud_present(*pud))
+			continue;
+
+		/*
+		 * If we are using 4K granules, check if we are using
+		 * 1GB section mappings.
+		 */
+		if (pud_sect(*pud)) {
+			if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+				if (!direct) {
+					free_pagetable(pud_page(*pud),
+						get_order(PUD_SIZE), direct);
+				}
+
+				/* Same locking caveat as above. */
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(pud);
+				spin_unlock(&init_mm.page_table_lock);
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pud_page(*pud));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PUD_SIZE)) {
+					free_pagetable(pud_page(*pud),
+						get_order(PUD_SIZE), direct);
+
+					spin_lock(&init_mm.page_table_lock);
+					pud_clear(pud);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+			continue;
+		}
+
+		BUG_ON(!pud_table(*pud));
+
+		pmd = pmd_offset(pud, addr);
+		remove_pmd_table(pmd, addr, next, direct);
+		free_pmd_table(pud, direct);
+	}
+}
+
+void remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long next;
+	unsigned long addr;
+	pgd_t *pgd;
+	pud_t *pud;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		if (pgd_none(*pgd))
+			continue;
+
+		pud = pud_offset(pgd, addr);
+		remove_pud_table(pud, addr, next, direct);
+		/*
+		 * When the PUD is folded on the PGD (three levels of
+		 * paging), the PMD page was already cleared in
+		 * free_pmd_table and the corresponding PGD==PUD entry
+		 * reset, so there is nothing left to free here.
+		 */
+#if CONFIG_PGTABLE_LEVELS > 3
+		free_pud_table(pgd, direct);
+#endif
+	}
+
+	flush_tlb_all();
+}
+
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 /*
  * Check whether a kernel address is valid (derived from arch/x86/).
  */
@@ -686,6 +1103,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 #endif	/* CONFIG_ARM64_64K_PAGES */
 void vmemmap_free(unsigned long start, unsigned long end)
 {
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	remove_pagetable(start, end, false);
+#endif
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
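The 0xFD (PAGE_INUSE) poisoning used throughout the vmemmap path reduces to a single memchr_inv() test. A small illustrative helper (not in the patch) makes the invariant explicit:

```c
/* Illustrative helper (not part of the patch): a vmemmap page may be
 * freed only once every byte equals PAGE_INUSE (0xFD), i.e. every
 * struct page slot it backs has been poisoned by some removal pass.
 * memchr_inv() returns the first byte differing from 0xFD, or NULL. */
static bool example_vmemmap_page_dead(const void *page_addr)
{
	return memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE) == NULL;
}
```

Note the asymmetry this encodes: for the linear ("direct") map, remove_pagetable() frees only page-table pages, since the mapped memory itself was released when it went offline; for vmemmap_free(), the backing pages are owned by the mapping and are freed as soon as they become wholly poisoned.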
