Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	211
1 files changed, 152 insertions, 59 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5f34bd8dda34..b8ce6f450956 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+	bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+	spin_unlock(&spool->lock);
+
+	/* If no pages are used, and no other handles to the subpool
+	 * remain, free the subpool the subpool remain */
+	if (free)
+		kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+	struct hugepage_subpool *spool;
+
+	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	if (!spool)
+		return NULL;
+
+	spin_lock_init(&spool->lock);
+	spool->count = 1;
+	spool->max_hpages = nr_blocks;
+	spool->used_hpages = 0;
+
+	return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+	spin_lock(&spool->lock);
+	BUG_ON(!spool->count);
+	spool->count--;
+	unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+				      long delta)
+{
+	int ret = 0;
+
+	if (!spool)
+		return 0;
+
+	spin_lock(&spool->lock);
+	if ((spool->used_hpages + delta) <= spool->max_hpages) {
+		spool->used_hpages += delta;
+	} else {
+		ret = -ENOMEM;
+	}
+	spin_unlock(&spool->lock);
+
+	return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+				       long delta)
+{
+	if (!spool)
+		return;
+
+	spin_lock(&spool->lock);
+	spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free spool due to
+	* an outstanding quota reference, free it now. */
+	unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+	return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	struct page *page = NULL;
+	struct page *page;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
+	unsigned int cpuset_mems_cookie;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
 	zonelist = huge_zonelist(vma, address,
 					htlb_alloc_mask, &mpol, &nodemask);
 	/*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 			}
 		}
 	}
-err:
+
 	mpol_cond_put(mpol);
-	put_mems_allowed();
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
 	return page;
+
+err:
+	mpol_cond_put(mpol);
+	return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugepage_subpool *spool =
+		(struct hugepage_subpool *)page_private(page);
 
-	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 	struct page *page, *tmp;
 	int ret, i;
 	int needed, allocated;
+	bool alloc_ok = true;
 
 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
 	if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
-		if (!page)
-			/*
-			 * We were not able to allocate enough pages to
-			 * satisfy the entire reservation so we free what
-			 * we've allocated so far.
-			 */
-			goto free;
-
+		if (!page) {
+			alloc_ok = false;
+			break;
+		}
 		list_add(&page->lru, &surplus_list);
 	}
-	allocated += needed;
+	allocated += i;
 
 	/*
 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
 	spin_lock(&hugetlb_lock);
 	needed = (h->resv_huge_pages + delta) -
 			(h->free_huge_pages + allocated);
-	if (needed > 0)
-		goto retry;
-
+	if (needed > 0) {
+		if (alloc_ok)
+			goto retry;
+		/*
+		 * We were not able to allocate enough pages to
+		 * satisfy the entire reservation so we free what
+		 * we've allocated so far.
+		 */
+		goto free;
+	}
 	/*
 	 * The surplus_list now contains _at_least_ the number of extra pages
 	 * needed to accommodate the reservation.  Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
 		VM_BUG_ON(page_count(page));
 		enqueue_huge_page(h, page);
 	}
+free:
 	spin_unlock(&hugetlb_lock);
 
 	/* Free unnecessary surplus pages to the buddy allocator */
-free:
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed.  Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur.  Where any new reservation would be required the
+ * reservation change is prepared, but not committed.  Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation.  No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	struct inode *inode = mapping->host;
 	long chg;
 
 	/*
-	 * Processes that did not create the mapping will have no reserves and
-	 * will not have accounted against quota. Check that the quota can be
-	 * made before satisfying the allocation
-	 * MAP_NORESERVE mappings may also need pages and quota allocated
-	 * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against subpool
+	 * limit. Check that the subpool limit can be made before
+	 * satisfying the allocation MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated allocated if no reserve
+	 * mapping overlaps.
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugepage_subpool_get_pages(spool, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugepage_subpool_put_pages(spool, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugepage_subpool_put_pages(spool, reserve);
 		}
 	}
 }
@@ -2241,16 +2331,23 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (huge_pmd_unshare(mm, &address, ptep))
 			continue;
 
+		pte = huge_ptep_get(ptep);
+		if (huge_pte_none(pte))
+			continue;
+
+		/*
+		 * HWPoisoned hugepage is already unmapped and dropped reference
+		 */
+		if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+			continue;
+
+		page = pte_page(pte);
 		/*
 		 * If a reference page is supplied, it is because a specific
 		 * page is being unmapped, not a range. Ensure the page we
 		 * are about to unmap is the actual page of interest.
 		 */
 		if (ref_page) {
-			pte = huge_ptep_get(ptep);
-			if (huge_pte_none(pte))
-				continue;
-			page = pte_page(pte);
 			if (page != ref_page)
 				continue;
 
@@ -2263,22 +2360,16 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
-		if (huge_pte_none(pte))
-			continue;
-
-		/*
-		 * HWPoisoned hugepage is already unmapped and dropped reference
-		 */
-		if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
-			continue;
-
-		page = pte_page(pte);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
 		list_add(&page->lru, &page_list);
+
+		/* Bail out after unmapping reference page if supplied */
+		if (ref_page)
+			break;
 	}
-	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	spin_unlock(&mm->page_table_lock);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		page_remove_rmap(page);
@@ -2316,7 +2407,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	address = address & huge_page_mask(h);
 	pgoff = vma_hugecache_offset(h, vma, address);
-	mapping = (struct address_space *)page_private(page);
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
 	 * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2960,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
-	 * and filesystem quota without using reserves
+	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
 		return 0;
@@ -2900,17 +2992,17 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (chg < 0)
 		return chg;
 
-	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	/* There must be enough pages in the subpool for the mapping */
+	if (hugepage_subpool_get_pages(spool, chg))
 		return -ENOSPC;
 
 	/*
 	 * Check enough hugepages are available for the reservation.
-	 * Hand back the quota if there are not
+	 * Hand the pages back to the subpool if there are not
	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugepage_subpool_put_pages(spool, chg);
 		return ret;
 	}
 
@@ -2934,12 +3026,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
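The heart of the diff above is the hugepage_subpool accounting that replaces the old per-mapping hugetlbfs quota calls: hugetlb_get_quota()/hugetlb_put_quota() on inode->i_mapping become hugepage_subpool_get_pages()/hugepage_subpool_put_pages() on a per-superblock subpool, and whichever reference goes away last (the superblock's handle dropped via hugepage_put_subpool(), or the final outstanding page charge) frees the structure through unlock_or_release_subpool(). Below is a minimal userspace sketch of that pattern, not kernel code: it assumes only the fields visible in the hunks (count, max_hpages, used_hpages) and substitutes a pthread mutex for the spinlock and malloc/free for kmalloc/kfree.

/*
 * Userspace model of the subpool get/put pattern from the patch above.
 * Assumptions: struct layout inferred from the hunks; pthread mutex in
 * place of the kernel spinlock; malloc/free in place of kmalloc/kfree.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct hugepage_subpool {
	pthread_mutex_t lock;
	long count;		/* handles held on the pool (superblock) */
	long max_hpages;	/* limit set when the pool is created */
	long used_hpages;	/* pages currently charged to the pool */
};

/* Called with spool->lock held; drops it and frees the pool once both
 * the handle count and the page charge have reached zero. */
static void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free_it = (spool->count == 0) && (spool->used_hpages == 0);

	pthread_mutex_unlock(&spool->lock);
	if (free_it) {
		pthread_mutex_destroy(&spool->lock);
		free(spool);
	}
}

static struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
{
	struct hugepage_subpool *spool = malloc(sizeof(*spool));

	if (!spool)
		return NULL;
	pthread_mutex_init(&spool->lock, NULL);
	spool->count = 1;
	spool->max_hpages = nr_blocks;
	spool->used_hpages = 0;
	return spool;
}

/* Drop the creator's handle; frees the pool only if no pages are charged. */
static void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	pthread_mutex_lock(&spool->lock);
	spool->count--;
	unlock_or_release_subpool(spool);
}

/* Charge delta pages against the limit; -ENOMEM when it does not fit. */
static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta)
{
	int ret = 0;

	pthread_mutex_lock(&spool->lock);
	if (spool->used_hpages + delta <= spool->max_hpages)
		spool->used_hpages += delta;
	else
		ret = -ENOMEM;
	pthread_mutex_unlock(&spool->lock);
	return ret;
}

/* Return pages; the last charge going away may free the pool. */
static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta)
{
	pthread_mutex_lock(&spool->lock);
	spool->used_hpages -= delta;
	unlock_or_release_subpool(spool);
}

int main(void)
{
	struct hugepage_subpool *spool = hugepage_new_subpool(2);

	printf("charge 2 pages: %d\n", hugepage_subpool_get_pages(spool, 2));
	printf("charge 1 more:  %d\n", hugepage_subpool_get_pages(spool, 1));
	/* "Unmount": the handle goes away while 2 pages are still charged,
	 * so the pool must stay allocated... */
	hugepage_put_subpool(spool);
	/* ...and the final uncharge is what frees it. */
	hugepage_subpool_put_pages(spool, 2);
	return 0;
}

The main() sequence mirrors the case called out in the hunk comments ("If hugetlbfs_put_super couldn't free spool due to an outstanding quota reference, free it now"): once the superblock handle is gone, the pool survives as long as pages remain charged to it, and the final put performs the free.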
