diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 3 | ||||
| -rw-r--r-- | mm/allocpercpu.c | 20 | ||||
| -rw-r--r-- | mm/filemap.c | 237 | ||||
| -rw-r--r-- | mm/filemap_xip.c | 2 | ||||
| -rw-r--r-- | mm/hugetlb.c | 2 | ||||
| -rw-r--r-- | mm/memcontrol.c | 364 | ||||
| -rw-r--r-- | mm/memory.c | 3 | ||||
| -rw-r--r-- | mm/migrate.c | 46 | ||||
| -rw-r--r-- | mm/nommu.c | 4 | ||||
| -rw-r--r-- | mm/page-writeback.c | 12 | ||||
| -rw-r--r-- | mm/pdflush.c | 4 | ||||
| -rw-r--r-- | mm/readahead.c | 6 | ||||
| -rw-r--r-- | mm/rmap.c | 16 | ||||
| -rw-r--r-- | mm/shmem.c | 52 | ||||
| -rw-r--r-- | mm/shmem_acl.c | 2 | ||||
| -rw-r--r-- | mm/slab.c | 11 | ||||
| -rw-r--r-- | mm/slob.c | 7 | ||||
| -rw-r--r-- | mm/slub.c | 13 | ||||
| -rw-r--r-- | mm/sparse.c | 2 | ||||
| -rw-r--r-- | mm/swap_state.c | 30 | ||||
| -rw-r--r-- | mm/swapfile.c | 10 | ||||
| -rw-r--r-- | mm/truncate.c | 6 | ||||
| -rw-r--r-- | mm/util.c | 55 | ||||
| -rw-r--r-- | mm/vmalloc.c | 6 | ||||
| -rw-r--r-- | mm/vmscan.c | 85 |
25 files changed, 634 insertions, 364 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index aa799007a11b..efee5d379df4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM +config HAVE_GET_USER_PAGES_FAST + bool + # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 843364594e23..4297bc41bfd2 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -18,27 +18,28 @@ * Depopulating per-cpu data for a cpu going offline would be a typical * use case. You need to register a cpu hotplug handler for that purpose. */ -void percpu_depopulate(void *__pdata, int cpu) +static void percpu_depopulate(void *__pdata, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); kfree(pdata->ptrs[cpu]); pdata->ptrs[cpu] = NULL; } -EXPORT_SYMBOL_GPL(percpu_depopulate); /** * percpu_depopulate_mask - depopulate per-cpu data for some cpu's * @__pdata: per-cpu data to depopulate * @mask: depopulate per-cpu data for cpu's selected through mask bits */ -void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) { int cpu; for_each_cpu_mask_nr(cpu, *mask) percpu_depopulate(__pdata, cpu); } -EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); + +#define percpu_depopulate_mask(__pdata, mask) \ + __percpu_depopulate_mask((__pdata), &(mask)) /** * percpu_populate - populate per-cpu data for given cpu @@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); * use case. You need to register a cpu hotplug handler for that purpose. * Per-cpu object is populated with zeroed buffer. */ -void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) +static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); int node = cpu_to_node(cpu); @@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) pdata->ptrs[cpu] = kzalloc(size, gfp); return pdata->ptrs[cpu]; } -EXPORT_SYMBOL_GPL(percpu_populate); /** * percpu_populate_mask - populate per-cpu data for more cpu's @@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate); * * Per-cpu objects are populated with zeroed buffers. */ -int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) +static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask) { cpumask_t populated; int cpu; @@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, cpu_set(cpu, populated); return 0; } -EXPORT_SYMBOL_GPL(__percpu_populate_mask); + +#define percpu_populate_mask(__pdata, size, gfp, mask) \ + __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** * percpu_alloc_mask - initial setup of per-cpu data diff --git a/mm/filemap.c b/mm/filemap.c index 7675b91f4f63..5de7633e1dbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -109,13 +109,13 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; @@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -442,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping, } /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & ~__GFP_HIGHMEM); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { - write_lock_irq(&mapping->tree_lock); + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } else - mem_cgroup_uncharge_page(page); + } else { + page->mapping = NULL; + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) @@ -633,15 +637,35 @@ void __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -656,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON(page->index != offset); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -747,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -774,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -806,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); @@ -1665,8 +1758,9 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { + struct dentry *dentry = file->f_path.dentry; int killsuid = should_remove_suid(dentry); int killpriv = security_inode_need_killpriv(dentry); int error = 0; @@ -1680,7 +1774,7 @@ int remove_suid(struct dentry *dentry) return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -2436,7 +2530,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; @@ -2563,9 +2657,8 @@ EXPORT_SYMBOL(generic_file_aio_write); * Otherwise return zero. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9d..98a3f31ccd6a 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -380,7 +380,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (count == 0) goto out_backing; - ret = remove_suid(filp->f_path.dentry); + ret = file_remove_suid(filp); if (ret) goto out_backing; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41341c414194..3be79dc18c5c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1026,7 +1026,6 @@ static void __init report_hugepages(void) } } -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) { @@ -1386,6 +1385,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) return nr; } +#ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e46451e1d9b7..fba566c51322 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,9 +35,9 @@ #include <asm/uaccess.h> -struct cgroup_subsys mem_cgroup_subsys; -static const int MEM_CGROUP_RECLAIM_RETRIES = 5; -static struct kmem_cache *page_cgroup_cache; +struct cgroup_subsys mem_cgroup_subsys __read_mostly; +static struct kmem_cache *page_cgroup_cache __read_mostly; +#define MEM_CGROUP_RECLAIM_RETRIES 5 /* * Statistics for memory cgroup. @@ -166,7 +166,6 @@ struct page_cgroup { struct list_head lru; /* per cgroup LRU list */ struct page *page; struct mem_cgroup *mem_cgroup; - int ref_cnt; /* cached, mapped, migrating */ int flags; }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ @@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc) enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ }; /* @@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); - list_del_init(&pc->lru); + list_del(&pc->lru); } static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, @@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active) struct mem_cgroup_per_zone *mz; unsigned long flags; + if (mem_cgroup_subsys.disabled) + return; + /* * We cannot lock_page_cgroup while holding zone's lru_lock, * because other holders of lock_page_cgroup can be interrupted @@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) + gfp_t gfp_mask, enum charge_type ctype, + struct mem_cgroup *memcg) { struct mem_cgroup *mem; struct page_cgroup *pc; @@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup_per_zone *mz; - if (mem_cgroup_subsys.disabled) - return 0; - - /* - * Should page_cgroup's go to their own slab? - * One could optimize the performance of the charging routine - * by saving a bit in the page_flags and using it as a lock - * to see if the cgroup page already has a page_cgroup associated - * with it - */ -retry: - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - /* - * The page_cgroup exists and - * the page has already been accounted. - */ - if (pc) { - VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - - pc->ref_cnt++; - unlock_page_cgroup(page); - goto done; - } - unlock_page_cgroup(page); - - pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); - if (pc == NULL) + pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); + if (unlikely(pc == NULL)) goto err; /* @@ -569,16 +546,18 @@ retry: * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!mm) - mm = &init_mm; - - rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - /* - * For every charge from the cgroup, increment reference count - */ - css_get(&mem->css); - rcu_read_unlock(); + if (likely(!memcg)) { + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + /* + * For every charge from the cgroup, increment reference count + */ + css_get(&mem->css); + rcu_read_unlock(); + } else { + mem = memcg; + css_get(&memcg->css); + } while (res_counter_charge(&mem->res, PAGE_SIZE)) { if (!(gfp_mask & __GFP_WAIT)) @@ -603,25 +582,24 @@ retry: } } - pc->ref_cnt = 1; pc->mem_cgroup = mem; pc->page = page; - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) pc->flags = PAGE_CGROUP_FLAG_CACHE; + else + pc->flags = PAGE_CGROUP_FLAG_ACTIVE; lock_page_cgroup(page); - if (page_get_page_cgroup(page)) { + if (unlikely(page_get_page_cgroup(page))) { unlock_page_cgroup(page); - /* - * Another charge has been added to this page already. - * We take lock_page_cgroup(page) again and read - * page->cgroup, increment refcnt.... just retry is OK. - */ res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); kmem_cache_free(page_cgroup_cache, pc); - goto retry; + goto done; } page_assign_page_cgroup(page, pc); @@ -642,24 +620,65 @@ err: int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + if (mem_cgroup_subsys.disabled) + return 0; + + /* + * If already mapped, we don't have to account. + * If page cache, page->mapping has address_space. + * But page->mapping may have out-of-use anon_vma pointer, + * detecit it by PageAnon() check. newly-mapped-anon's page->mapping + * is NULL. + */ + if (page_mapped(page) || (page->mapping && !PageAnon(page))) + return 0; + if (unlikely(!mm)) + mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - if (!mm) + if (mem_cgroup_subsys.disabled) + return 0; + + /* + * Corner case handling. This is called from add_to_page_cache() + * in usual. But some FS (shmem) precharges this page before calling it + * and call add_to_page_cache() with GFP_NOWAIT. + * + * For GFP_NOWAIT case, the page may be pre-charged before calling + * add_to_page_cache(). (See shmem.c) check it here and avoid to call + * charge twice. (It works but has to pay a bit larger cost.) + */ + if (!(gfp_mask & __GFP_WAIT)) { + struct page_cgroup *pc; + + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + if (pc) { + VM_BUG_ON(pc->page != page); + VM_BUG_ON(!pc->mem_cgroup); + unlock_page_cgroup(page); + return 0; + } + unlock_page_cgroup(page); + } + + if (unlikely(!mm)) mm = &init_mm; + return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); } /* - * Uncharging is always a welcome operation, we never complain, simply - * uncharge. + * uncharge if !page_mapped(page) */ -void mem_cgroup_uncharge_page(struct page *page) +static void +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem; @@ -674,98 +693,151 @@ void mem_cgroup_uncharge_page(struct page *page) */ lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (!pc) + if (unlikely(!pc)) goto unlock; VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - if (--(pc->ref_cnt) == 0) { - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) + || page_mapped(page))) + goto unlock; - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_remove_list(mz, pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); - mem = pc->mem_cgroup; - res_counter_uncharge(&mem->res, PAGE_SIZE); - css_put(&mem->css); + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); - kmem_cache_free(page_cgroup_cache, pc); - return; - } + mem = pc->mem_cgroup; + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); + kmem_cache_free(page_cgroup_cache, pc); + return; unlock: unlock_page_cgroup(page); } +void mem_cgroup_uncharge_page(struct page *page) +{ + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + +void mem_cgroup_uncharge_cache_page(struct page *page) +{ + VM_BUG_ON(page_mapped(page)); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); +} + /* - * Returns non-zero if a page (under migration) has valid page_cgroup member. - * Refcnt of page_cgroup is incremented. + * Before starting migration, account against new page. */ -int mem_cgroup_prepare_migration(struct page *page) +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) { struct page_cgroup *pc; + struct mem_cgroup *mem = NULL; + enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + int ret = 0; if (mem_cgroup_subsys.disabled) return 0; lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (pc) - pc->ref_cnt++; + if (pc) { + mem = pc->mem_cgroup; + css_get(&mem->css); + if (pc->flags & PAGE_CGROUP_FLAG_CACHE) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + } unlock_page_cgroup(page); - return pc != NULL; + if (mem) { + ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, + ctype, mem); + css_put(&mem->css); + } + return ret; } -void mem_cgroup_end_migration(struct page *page) +/* remove redundant charge if migration failed*/ +void mem_cgroup_end_migration(struct page *newpage) { - mem_cgroup_uncharge_page(page); + /* + * At success, page->mapping is not NULL. + * special rollback care is necessary when + * 1. at migration failure. (newpage->mapping is cleared in this case) + * 2. the newpage was moved but not remapped again because the task + * exits and the newpage is obsolete. In this case, the new page + * may be a swapcache. So, we just call mem_cgroup_uncharge_page() + * always for avoiding mess. The page_cgroup will be removed if + * unnecessary. File cache pages is still on radix-tree. Don't + * care it. + */ + if (!newpage->mapping) + __mem_cgroup_uncharge_common(newpage, + MEM_CGROUP_CHARGE_TYPE_FORCE); + else if (PageAnon(newpage)) + mem_cgroup_uncharge_page(newpage); } /* - * We know both *page* and *newpage* are now not-on-LRU and PG_locked. - * And no race with uncharge() routines because page_cgroup for *page* - * has extra one reference by mem_cgroup_prepare_migration. + * A call to try to shrink memory usage under specified resource controller. + * This is typically used for page reclaiming for shmem for reducing side + * effect of page allocation from shmem, which is used by some mem_cgroup. */ -void mem_cgroup_page_migration(struct page *page, struct page *newpage) +int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) { - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - unsigned long flags; + struct mem_cgroup *mem; + int progress = 0; + int retry = MEM_CGROUP_RECLAIM_RETRIES; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (!pc) { - unlock_page_cgroup(page); - return; - } + if (mem_cgroup_subsys.disabled) + return 0; - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + css_get(&mem->css); + rcu_read_unlock(); - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); + do { + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + } while (!progress && --retry); - pc->page = newpage; - lock_page_cgroup(newpage); - page_assign_page_cgroup(newpage, pc); + css_put(&mem->css); + if (!retry) + return -ENOMEM; + return 0; +} - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); +int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +{ + + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int progress; + int ret = 0; - unlock_page_cgroup(newpage); + while (res_counter_set_limit(&memcg->res, val)) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!retry_count) { + ret = -EBUSY; + break; + } + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); + if (!progress) + retry_count--; + } + return ret; } + /* * This routine traverse page_cgroup in given list and drop them all. - * This routine ignores page_cgroup->ref_cnt. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ #define FORCE_UNCHARGE_BATCH (128) @@ -790,12 +862,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, page = pc->page; get_page(page); spin_unlock_irqrestore(&mz->lru_lock, flags); - mem_cgroup_uncharge_page(page); - put_page(page); - if (--count <= 0) { - count = FORCE_UNCHARGE_BATCH; + /* + * Check if this page is on LRU. !LRU page can be found + * if it's under page migration. + */ + if (PageLRU(page)) { + __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_FORCE); + put_page(page); + if (--count <= 0) { + count = FORCE_UNCHARGE_BATCH; + cond_resched(); + } + } else cond_resched(); - } spin_lock_irqsave(&mz->lru_lock, flags); } spin_unlock_irqrestore(&mz->lru_lock, flags); @@ -810,9 +890,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) int ret = -EBUSY; int node, zid; - if (mem_cgroup_subsys.disabled) - return 0; - css_get(&mem->css); /* * page reclaim code (kswapd etc..) will move pages between @@ -838,32 +915,34 @@ out: return ret; } -static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) -{ - *tmp = memparse(buf, &buf); - if (*buf != '\0') - return -EINVAL; - - /* - * Round up the value to the closest page size - */ - *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; - return 0; -} - static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, cft->private); } - -static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +/* + * The user of this function is... + * RES_LIMIT. + */ +static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) { - return res_counter_write(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - mem_cgroup_write_strategy); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret; + + switch (cft->private) { + case RES_LIMIT: + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (!ret) + ret = mem_cgroup_resize_limit(memcg, val); + break; + default: + ret = -EINVAL; /* should be BUG() ? */ + break; + } + return ret; } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) @@ -940,7 +1019,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, - .write = mem_cgroup_write, + .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, { @@ -1070,8 +1149,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, static int mem_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) { - if (mem_cgroup_subsys.disabled) - return 0; return cgroup_add_files(cont, ss, mem_cgroup_files, ARRAY_SIZE(mem_cgroup_files)); } @@ -1084,9 +1161,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct mm_struct *mm; struct mem_cgroup *mem, *old_mem; - if (mem_cgroup_subsys.disabled) - return; - mm = get_task_mm(p); if (mm == NULL) return; diff --git a/mm/memory.c b/mm/memory.c index 262e3eb6601a..a8ca04faaea6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, + unsigned long vaddr) { printk(KERN_ERR "Bad pte = %08llx, process = %s, " "vm_flags = %lx, vaddr = %lx\n", diff --git a/mm/migrate.c b/mm/migrate.c index 376cceba82f9..153572fb60b8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, page = migration_entry_to_page(entry); - get_page(page); + /* + * Once radix-tree replacement of page migration started, page_count + * *must* be zero. And, we don't want to call wait_on_page_locked() + * against a page without get_page(). + * So, we use get_page_unless_zero(), here. Even failed, page fault + * will occur again. + */ + if (!get_page_unless_zero(page)) + goto out; pte_unmap_unlock(ptep, ptl); wait_on_page_locked(page); put_page(page); @@ -305,6 +313,7 @@ out: static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + int expected_count; void **pslot; if (!mapping) { @@ -314,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, return 0; } - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - if (page_count(page) != 2 + !!PagePrivate(page) || + expected_count = 2 + !!PagePrivate(page); + if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); + page_unfreeze_refs(page, expected_count); /* * Drop cache reference from old page. * We know this isn't the last reference. @@ -357,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping, __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + if (!PageSwapCache(newpage)) + mem_cgroup_uncharge_cache_page(page); return 0; } @@ -611,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page) rc = fallback_migrate_page(mapping, newpage, page); if (!rc) { - mem_cgroup_page_migration(page, newpage); remove_migration_ptes(page, newpage); } else newpage->mapping = NULL; @@ -641,6 +658,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; + charge = mem_cgroup_prepare_migration(page, newpage); + if (charge == -ENOMEM) { + rc = -ENOMEM; + goto move_newpage; + } + /* prepare cgroup just returns 0 or -ENOMEM */ + BUG_ON(charge); + rc = -EAGAIN; if (TestSetPageLocked(page)) { if (!force) @@ -692,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto rcu_unlock; } - charge = mem_cgroup_prepare_migration(page); /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); if (!page_mapped(page)) rc = move_to_new_page(newpage, page); - if (rc) { + if (rc) remove_migration_ptes(page, page); - if (charge) - mem_cgroup_end_migration(page); - } else if (charge) - mem_cgroup_end_migration(newpage); rcu_unlock: if (rcu_locked) rcu_read_unlock(); @@ -725,6 +745,8 @@ unlock: } move_newpage: + if (!charge) + mem_cgroup_end_migration(newpage); /* * Move the new page to the LRU. If migration was not successful * then this will free the page. diff --git a/mm/nommu.c b/mm/nommu.c index 4462b6a3fcb9..5edccd9c9218 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,7 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/ptrace.h> +#include <linux/tracehook.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/mount.h> @@ -745,7 +745,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) + if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) vm_flags &= ~VM_MAYSHARE; return vm_flags; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab3..24de8b65fdbd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (!mapping) return 1; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) { radix_tree_tag_clear(&mapping->page_tree, @@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page) __bdi_writeout_inc(bdi); } } - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); } @@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) { radix_tree_tag_set(&mapping->page_tree, @@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } diff --git a/mm/pdflush.c b/mm/pdflush.c index 9d834aa4b979..0cbe0c60c6bf 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work) * Thread creation: For how long have there been zero * available threads? */ - if (jiffies - last_empty_jifs > 1 * HZ) { + if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { /* unlocked list_empty() test is OK here */ if (list_empty(&pdflush_list)) { /* unlocked test is OK here */ @@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work) if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { + if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { /* Limit exit rate */ pdf->when_i_went_to_sleep = jiffies; break; /* exeunt */ diff --git a/mm/readahead.c b/mm/readahead.c index d8723a5f6496..77e8ddf945e9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping, if (hit_readahead_marker) { pgoff_t start; - read_lock_irq(&mapping->tree_lock); - start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); + start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + rcu_read_unlock(); if (!start || start - offset > max) return 0; diff --git a/mm/rmap.c b/mm/rmap.c index bf0a5b7cfb8e..39ae5a9bf382 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(struct kmem_cache *cachep, void *data) +static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; @@ -576,14 +576,8 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (atomic_inc_and_test(&page->_mapcount)) __page_set_anon_rmap(page, vma, address); - else { + else __page_check_anon_rmap(page, vma, address); - /* - * We unconditionally charged during prepare, we uncharge here - * This takes care of balancing the reference counts - */ - mem_cgroup_uncharge_page(page); - } } /** @@ -614,12 +608,6 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) __inc_zone_page_state(page, NR_FILE_MAPPED); - else - /* - * We unconditionally charged during prepare, we uncharge here - * This takes care of balancing the reference counts - */ - mem_cgroup_uncharge_page(page); } #ifdef CONFIG_DEBUG_VM diff --git a/mm/shmem.c b/mm/shmem.c index 9ffbea9b79e1..952d361774bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -922,20 +922,26 @@ found: error = 1; if (!inode) goto out; - /* Precharge page while we can wait, compensate afterwards */ + /* Precharge page using GFP_KERNEL while we can wait */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); - if (error) - goto uncharge; + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto out; + } error = 1; spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, NULL); - if (ptr && ptr->val == entry.val) - error = add_to_page_cache(page, inode->i_mapping, + if (ptr && ptr->val == entry.val) { + error = add_to_page_cache_locked(page, inode->i_mapping, idx, GFP_NOWAIT); + /* does mem_cgroup_uncharge_cache_page on error */ + } else /* we must compensate for our precharge above */ + mem_cgroup_uncharge_cache_page(page); + if (error == -EEXIST) { struct page *filepage = find_get_page(inode->i_mapping, idx); error = 1; @@ -961,8 +967,6 @@ found: shmem_swp_unmap(ptr); spin_unlock(&info->lock); radix_tree_preload_end(); -uncharge: - mem_cgroup_uncharge_page(page); out: unlock_page(page); page_cache_release(page); @@ -1297,8 +1301,8 @@ repeat: SetPageUptodate(filepage); set_page_dirty(filepage); swap_free(swap); - } else if (!(error = add_to_page_cache( - swappage, mapping, idx, GFP_NOWAIT))) { + } else if (!(error = add_to_page_cache_locked(swappage, mapping, + idx, GFP_NOWAIT))) { info->flags |= SHMEM_PAGEIN; shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); @@ -1311,17 +1315,14 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); + page_cache_release(swappage); if (error == -ENOMEM) { /* allow reclaim from this memory cgroup */ - error = mem_cgroup_cache_charge(swappage, - current->mm, gfp & ~__GFP_HIGHMEM); - if (error) { - page_cache_release(swappage); + error = mem_cgroup_shrink_usage(current->mm, + gfp); + if (error) goto failed; - } - mem_cgroup_uncharge_page(swappage); } - page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { @@ -1358,6 +1359,8 @@ repeat: } if (!filepage) { + int ret; + spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { @@ -1386,10 +1389,18 @@ repeat: swap = *entry; shmem_swp_unmap(entry); } - if (error || swap.val || 0 != add_to_page_cache_lru( - filepage, mapping, idx, GFP_NOWAIT)) { + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(filepage); + else + ret = add_to_page_cache_lru(filepage, mapping, + idx, GFP_NOWAIT); + /* + * At add_to_page_cache_lru() failure, uncharge will + * be done automatically. + */ + if (ret) { spin_unlock(&info->lock); - mem_cgroup_uncharge_page(filepage); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1398,7 +1409,6 @@ repeat: goto failed; goto repeat; } - mem_cgroup_uncharge_page(filepage); info->flags |= SHMEM_PAGEIN; } @@ -2342,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index f5664c5b9eb1..8e5aadd7dcd6 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) * shmem_permission - permission() inode operation */ int -shmem_permission(struct inode *inode, int mask, struct nameidata *nd) +shmem_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, shmem_check_acl); } diff --git a/mm/slab.c b/mm/slab.c index 052e7d64537e..918f04f7fef1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -406,7 +406,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *obj); /* 5) cache creation/removal */ const char *name; @@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(cachep, objp + obj_offset(cachep)); + cachep->ctor(objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", diff --git a/mm/slob.c b/mm/slob.c index de268eb7ac70..d8fbd4d1bfa7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -525,12 +525,11 @@ struct kmem_cache { unsigned int size, align; unsigned long flags; const char *name; - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *c; @@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) - c->ctor(c, b); + c->ctor(b); return b; } diff --git a/mm/slub.c b/mm/slub.c index 77c21cf53ff9..b7e2cd5d82db 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. @@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } @@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; diff --git a/mm/sparse.c b/mm/sparse.c index 8ffc08990008..5d9dbbb9d39e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -377,7 +377,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; struct mem_section *ms = __nr_to_section(pnum); diff --git a/mm/swap_state.c b/mm/swap_state.c index d8aadaf2a0ba..b8035b055129 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), + .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -56,7 +56,8 @@ static struct { void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", + printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); @@ -64,7 +65,7 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) @@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { - write_lock_irq(&swapper_space.tree_lock); + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); + if (likely(!error)) { total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); + + if (unlikely(error)) { + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } } return error; } @@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2f33edb8bee9..6beb6251e99d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -33,8 +33,8 @@ #include <asm/tlbflush.h> #include <linux/swapops.h> -DEFINE_SPINLOCK(swap_lock); -unsigned int nr_swapfiles; +static DEFINE_SPINLOCK(swap_lock); +static unsigned int nr_swapfiles; long total_swap_pages; static int swap_overflow; static int least_priority; @@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +static struct swap_list_t swap_list = {-1, -1}; static struct swap_info_struct swap_info[MAX_SWAPFILES]; @@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. */ - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); } spin_unlock(&swap_lock); diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb63414..e68443d74567 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; failed: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } diff --git a/mm/util.c b/mm/util.c index 8f18683825bc..9341ca77bd88 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,7 +1,9 @@ +#include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/sched.h> #include <asm/uaccess.h> /** @@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** - * krealloc - reallocate memory. The contents will remain unchanged. + * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. * @flags: the type of memory to allocate. * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *__krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks = 0; - if (unlikely(!new_size)) { - kfree(p); + if (unlikely(!new_size)) return ZERO_SIZE_PTR; - } if (p) ks = ksize(p); @@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; ret = kmalloc_track_caller(new_size, flags); - if (ret && p) { + if (ret && p) memcpy(ret, p, ks); + + return ret; +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { kfree(p); + return ZERO_SIZE_PTR; } + + ret = __krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + return ret; } EXPORT_SYMBOL(krealloc); @@ -136,3 +162,12 @@ char *strndup_user(const char __user *s, long n) return p; } EXPORT_SYMBOL(strndup_user); + +#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35f293816294..85b9a0d2c877 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages) return; if ((PAGE_SIZE-1) & (unsigned long)addr) { - printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } area = remove_vm_area(addr); if (unlikely(!area)) { - printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); - WARN_ON(1); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 967d30ccd92b..8f71761bc4b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -38,6 +38,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> +#include <linux/delayacct.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } /* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. */ -int remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. * @@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page) * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (unlikely(page_count(page) != 2)) + if (!page_freeze_refs(page, 2)) goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); goto cannot_free; + } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + } else { + __remove_from_page_cache(page); + spin_unlock_irq(&mapping->tree_lock); } - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + page_unfreeze_refs(page, 1); + return 1; + } return 0; } @@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PagePrivate(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) - goto free_it; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } } - if (!mapping || !remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; -free_it: unlock_page(page); +free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) - __pagevec_release_nonlru(&freed_pvec); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } continue; activate_locked: @@ -622,7 +657,7 @@ keep: } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) - __pagevec_release_nonlru(&freed_pvec); + __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + delayacct_freepages_start(); + if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); /* @@ -1396,6 +1433,8 @@ out: } else mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + delayacct_freepages_end(); + return ret; } |
