From 1f1cd7054fe7f45e65dd4963d0a38e5ab7a57cae Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Jan 2014 15:49:16 -0800 Subject: mm/mlock: prepare params outside critical region All mlock related syscalls prepare lock limits, lengths and start parameters with the mmap_sem held. Move this logic outside of the critical region. For the case of mlock, continue incrementing the amount already locked by mm->locked_vm with the rwsem taken. Signed-off-by: Davidlohr Bueso Cc: Rik van Riel Reviewed-by: Michel Lespinasse Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..10819ed4df3e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; - locked = len >> PAGE_SHIFT; - locked += current->mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked += current->mm->locked_vm; /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); if (!error) error = __mm_populate(start, len, 0); @@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; + + down_write(¤t->mm->mmap_sem); ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); + return ret; } @@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; + down_write(¤t->mm->mmap_sem); + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); -- cgit v1.2.3 From 01cc2e58697e34c6ee9a40fb6cebc18bf5a1923f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 23 Jan 2014 15:52:50 -0800 Subject: mm: munlock: fix potential race with THP page split Since commit ff6a6da60b89 ("mm: accelerate munlock() treatment of THP pages") munlock skips tail pages of a munlocked THP page. There is some attempt to prevent bad consequences of racing with a THP page split, but code inspection indicates that there are two problems that may lead to a non-fatal, yet wrong outcome. First, __split_huge_page_refcount() copies flags including PageMlocked from the head page to the tail pages. Clearing PageMlocked by munlock_vma_page() in the middle of this operation might result in part of tail pages left with PageMlocked flag. As the head page still appears to be a THP page until all tail pages are processed, munlock_vma_page() might think it munlocked the whole THP page and skip all the former tail pages. Before ff6a6da60, those pages would be cleared in further iterations of munlock_vma_pages_range(), but NR_MLOCK would still become undercounted (related the next point). Second, NR_MLOCK accounting is based on call to hpage_nr_pages() after the PageMlocked is cleared. The accounting might also become inconsistent due to race with __split_huge_page_refcount() - undercount when HUGE_PMD_NR is subtracted, but some tail pages are left with PageMlocked set and counted again (only possible before ff6a6da60) - overcount when hpage_nr_pages() sees a normal page (split has already finished), but the parallel split has meanwhile cleared PageMlocked from additional tail pages This patch prevents both problems via extending the scope of lru_lock in munlock_vma_page(). This is convenient because: - __split_huge_page_refcount() takes lru_lock for its whole operation - munlock_vma_page() typically takes lru_lock anyway for page isolation As this becomes a second function where page isolation is done with lru_lock already held, factor this out to a new __munlock_isolate_lru_page() function and clean up the code around. [akpm@linux-foundation.org: avoid a coding-style ugly] Signed-off-by: Vlastimil Babka Cc: Sasha Levin Cc: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 104 +++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 44 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/mlock.c b/mm/mlock.c index 10819ed4df3e..b30adbe62034 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -90,6 +90,26 @@ void mlock_vma_page(struct page *page) } } +/* + * Isolate a page from LRU with optional get_page() pin. + * Assumes lru_lock already held and page already pinned. + */ +static bool __munlock_isolate_lru_page(struct page *page, bool getpage) +{ + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); + if (getpage) + get_page(page); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + return true; + } + + return false; +} + /* * Finish munlock after successful page isolation * @@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page) static void __munlock_isolation_failed(struct page *page) { if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); + __count_vm_event(UNEVICTABLE_PGSTRANDED); else - count_vm_event(UNEVICTABLE_PGMUNLOCKED); + __count_vm_event(UNEVICTABLE_PGMUNLOCKED); } /** @@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page) unsigned int munlock_vma_page(struct page *page) { unsigned int nr_pages; + struct zone *zone = page_zone(page); BUG_ON(!PageLocked(page)); - if (TestClearPageMlocked(page)) { - nr_pages = hpage_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (!isolate_lru_page(page)) - __munlock_isolated_page(page); - else - __munlock_isolation_failed(page); - } else { - nr_pages = hpage_nr_pages(page); - } - /* - * Regardless of the original PageMlocked flag, we determine nr_pages - * after touching the flag. This leaves a possible race with a THP page - * split, such that a whole THP page was munlocked, but nr_pages == 1. - * Returning a smaller mask due to that is OK, the worst that can - * happen is subsequent useless scanning of the former tail pages. - * The NR_MLOCK accounting can however become broken. + * Serialize with any parallel __split_huge_page_refcount() which + * might otherwise copy PageMlocked to part of the tail pages before + * we clear it in the head page. It also stabilizes hpage_nr_pages(). */ + spin_lock_irq(&zone->lru_lock); + + nr_pages = hpage_nr_pages(page); + if (!TestClearPageMlocked(page)) + goto unlock_out; + + __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + + if (__munlock_isolate_lru_page(page, true)) { + spin_unlock_irq(&zone->lru_lock); + __munlock_isolated_page(page); + goto out; + } + __munlock_isolation_failed(page); + +unlock_out: + spin_unlock_irq(&zone->lru_lock); + +out: return nr_pages - 1; } @@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct page *page = pvec->pages[i]; if (TestClearPageMlocked(page)) { - struct lruvec *lruvec; - int lru; - - if (PageLRU(page)) { - lruvec = mem_cgroup_page_lruvec(page, zone); - lru = page_lru(page); - /* - * We already have pin from follow_page_mask() - * so we can spare the get_page() here. - */ - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, lru); - } else { - __munlock_isolation_failed(page); - goto skip_munlock; - } - - } else { -skip_munlock: /* - * We won't be munlocking this page in the next phase - * but we still need to release the follow_page_mask() - * pin. We cannot do it under lru_lock however. If it's - * the last pin, __page_cache_release would deadlock. + * We already have pin from follow_page_mask() + * so we can spare the get_page() here. */ - pagevec_add(&pvec_putback, pvec->pages[i]); - pvec->pages[i] = NULL; + if (__munlock_isolate_lru_page(page, false)) + continue; + else + __munlock_isolation_failed(page); } + + /* + * We won't be munlocking this page in the next phase + * but we still need to release the follow_page_mask() + * pin. We cannot do it under lru_lock however. If it's + * the last pin, __page_cache_release() would deadlock. + */ + pagevec_add(&pvec_putback, pvec->pages[i]); + pvec->pages[i] = NULL; } delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); -- cgit v1.2.3 From 309381feaee564281c3d9e90fbca8963bb7428ad Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 23 Jan 2014 15:52:54 -0800 Subject: mm: dump page when hitting a VM_BUG_ON using VM_BUG_ON_PAGE Most of the VM_BUG_ON assertions are performed on a page. Usually, when one of these assertions fails we'll get a BUG_ON with a call stack and the registers. I've recently noticed based on the requests to add a small piece of code that dumps the page to various VM_BUG_ON sites that the page dump is quite useful to people debugging issues in mm. This patch adds a VM_BUG_ON_PAGE(cond, page) which beyond doing what VM_BUG_ON() does, also dumps the page before executing the actual BUG_ON. [akpm@linux-foundation.org: fix up includes] Signed-off-by: Sasha Levin Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/mlock.c') diff --git a/mm/mlock.c b/mm/mlock.c index b30adbe62034..4e1a68162285 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -279,8 +279,8 @@ static int __mlock_posix_error_return(long retval) static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, int *pgrescued) { - VM_BUG_ON(PageLRU(page)); - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (page_mapcount(page) <= 1 && page_evictable(page)) { pagevec_add(pvec, page); -- cgit v1.2.3