From 78f11a255749d09025f54d4e2df4fbcb031530e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 27 Apr 2011 15:26:45 -0700 Subject: mm: thp: fix /dev/zero MAP_PRIVATE and vm_flags cleanups
The huge_memory.c THP page fault was allowed to run if vm_ops was null (which would succeed for /dev/zero MAP_PRIVATE, as the f_op->mmap wouldn't set up a special vma->vm_ops and it would fall back to regular anonymous memory), but other THP logic wasn't fully activated for vmas with a non-NULL vm_file (/dev/zero has a non-NULL vma->vm_file). So this removes the vm_file checks so that /dev/zero can also safely use THP (the other, albeit safer, approach to fixing this bug would have been to prevent the initial THP page fault from running if vm_file was set). After removing the vm_file checks, this also makes huge_memory.c stricter in khugepaged for the DEBUG_VM=y case. It doesn't replace the vm_file check with an is_pfn_mapping check (but it keeps checking for VM_PFNMAP under VM_BUG_ON) because for an is_cow_mapping() mapping VM_PFNMAP should only be allowed to exist before the first page fault, and in turn when vma->anon_vma is null (so preventing khugepaged registration). So I tend to think the previous comment, which said that if vm_file was set, VM_PFNMAP might have been set and we could still be registered in khugepaged (even though anon_vma has to be non-NULL for khugepaged registration), was too paranoid. The is_linear_pfn_mapping check is, I think, also superfluous (as described by the comment), but under DEBUG_VM it is safe to keep.
Addresses https://bugzilla.kernel.org/show_bug.cgi?id=33682 Signed-off-by: Andrea Arcangeli Reported-by: Caspar Zhang Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h')
diff --git a/include/linux/mm.h b/include/linux/mm.h index 692dbae6ffa7..2348db26bc3d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -137,7 +137,8 @@ extern unsigned int kobjsize(const void *objp); #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) /* - * special vmas that are non-mergable, non-mlock()able + * Special vmas that are non-mergable, non-mlock()able. + * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) -- cgit v1.2.3
From a09a79f66874c905af35d5bb5e5f2fdc7b6b894d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 9 May 2011 13:01:09 +0200 Subject: Don't lock guardpage if the stack is growing up
The Linux kernel excludes the guard page when performing mlock on a VMA with a down-growing stack. However, some architectures have an up-growing stack, and the guard page should be excluded from locking in that case too. This patch fixes lvm2 on PA-RISC (and possibly other architectures with an up-growing stack). lvm2 calculates the number of used pages when locking and when unlocking, and reports an internal error if the numbers mismatch. [ Patch changed fairly extensively to also fix /proc//maps for the grows-up case, and to move things around a bit to clean it all up and share the infrastructure with the /proc bits.
Tested on ia64 that has both grow-up and grow-down segments - Linus ] Signed-off-by: Mikulas Patocka Tested-by: Tony Luck Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2348db26bc3d..6507dde38b16 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,11 +1011,33 @@ int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); /* Is the vma a continuation of the stack vma above it? */ -static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) +static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr) { return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); } +static inline int stack_guard_page_start(struct vm_area_struct *vma, + unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_growsdown(vma->vm_prev, addr); +} + +/* Is the vma a continuation of the stack vma below it? */ +static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr) +{ + return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP); +} + +static inline int stack_guard_page_end(struct vm_area_struct *vma, + unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSUP) && + (vma->vm_end == addr) && + !vma_growsup(vma->vm_next, addr); +} + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); -- cgit v1.2.3 From 7bf02ea22c6cdd09e2d3f1d3c3fe366b834ae9af Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 24 May 2011 17:11:16 -0700 Subject: arch, mm: filter disallowed nodes from arch specific show_mem functions Architectures that implement their own show_mem() function did not pass the filter argument to show_free_areas() to appropriately avoid emitting the state of nodes that are disallowed in the current context. This patch now passes the filter argument to show_free_areas() so those nodes are now avoided. This patch also removes the show_free_areas() wrapper around __show_free_areas() and converts existing callers to pass an empty filter. ia64 emits additional information for each node, so skip_free_areas_zone() must be made global to filter disallowed nodes and it is converted to use a nid argument rather than a zone for this use case. Signed-off-by: David Rientjes Cc: Russell King Cc: Tony Luck Cc: Fenghua Yu Cc: Kyle McMartin Cc: Helge Deller Cc: James Bottomley Cc: "David S. Miller" Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6507dde38b16..1746f67c33de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -862,13 +862,13 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) /* - * Flags passed to show_mem() and __show_free_areas() to suppress output in + * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. 
*/ #define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ -extern void show_free_areas(void); -extern void __show_free_areas(unsigned int flags); +extern void show_free_areas(unsigned int flags); +extern bool skip_free_areas_node(unsigned int flags, int nid); int shmem_lock(struct file *file, int lock, struct user_struct *user); struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); -- cgit v1.2.3
From 37b23e0525d393d48a7d59f870b3bc061a30ccdb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:30 -0700 Subject: x86,mm: make pagefault killable
When an oom killing occurs, almost all processes get stuck at one of the following two points: 1) __alloc_pages_nodemask 2) __lock_page_or_retry 1) is not very problematic because TIF_MEMDIE leads to an allocation failure, which lets the task get out of the page allocator. 2) is more problematic. In an OOM situation, zones typically don't have any page cache at all, and memory starvation might lead to greatly reduced IO performance. When a fork bomb occurs, TIF_MEMDIE tasks don't die quickly, meaning that a fork bomb may create new processes faster than the oom-killer can kill them. Then the system may become livelocked. This patch makes the pagefault interruptible by SIGKILL. Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Matthew Wilcox Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/mm.h')
diff --git a/include/linux/mm.h b/include/linux/mm.h index 1746f67c33de..57d3d5fade16 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -153,6 +153,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is -- cgit v1.2.3
From 1b79acc91115ba47e744b70bb166b77bd94f5855 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:32 -0700 Subject: mm, mem-hotplug: recalculate lowmem_reserve when memory hotplug occurs
Currently, memory hotplug calls setup_per_zone_wmarks() and calculate_zone_inactive_ratio(), but doesn't call setup_per_zone_lowmem_reserve(). This means the number of reserved pages isn't updated even when memory hotplug occurs. This patch fixes that.
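The mm.h hunk below only swaps the exported declarations; conceptually, the hotplug online path is expected to end up calling the newly exported helper, roughly as in this hedged sketch (the surrounding call site is invented for illustration):

    /*
     * Hedged sketch, not part of the hunk below: after onlining memory,
     * re-derive the per-zone limits via the helper whose declaration this
     * patch exports; it is expected to recompute the watermarks and, with
     * this fix, the lowmem_reserve values as well.
     */
    init_per_zone_wmark_min();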
Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57d3d5fade16..e173cd297d88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1381,7 +1381,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_wmarks(void); -extern void calculate_zone_inactive_ratio(struct zone *zone); +extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(unsigned int flags); -- cgit v1.2.3 From d05f3169c0fbca16132ec7c2be71685c6de638b5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 May 2011 17:11:44 -0700 Subject: mm: make expand_downwards() symmetrical with expand_upwards() Currently we have expand_upwards exported while expand_downwards is accessible only via expand_stack or expand_stack_downwards. check_stack_guard_page is a nice example of the asymmetry. It uses expand_stack for VM_GROWSDOWN while expand_upwards is called for VM_GROWSUP case. Let's clean this up by exporting both functions and make those names consistent. Let's use expand_{upwards,downwards} because expanding doesn't always involve stack manipulation (an example is ia64_do_page_fault which uses expand_upwards for registers backing store expansion). expand_downwards has to be defined for both CONFIG_STACK_GROWS{UP,DOWN} because get_arg_page calls the downwards version in the early process initialization phase for growsup configuration. Signed-off-by: Michal Hocko Acked-by: Hugh Dickins Cc: James Bottomley Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index e173cd297d88..d2948af126ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1518,15 +1518,17 @@ unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp); -/* Do stack extension */ +/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); + +/* CONFIG_STACK_GROWSUP still needs to to grow downwards at some places */ +extern int expand_downwards(struct vm_area_struct *vma, + unsigned long address); #if VM_GROWSUP extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); #else #define expand_upwards(vma, address) do { } while (0) #endif -extern int expand_stack_downwards(struct vm_area_struct *vma, - unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -- cgit v1.2.3 From d16dfc550f5326a4000f3322582a7c05dec91d7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:11:45 -0700 Subject: mm: mmu_gather rework Rework the existing mmu_gather infrastructure. The direct purpose of these patches was to allow preemptible mmu_gather, but even without that I think these patches provide an improvement to the status quo. The first 9 patches rework the mmu_gather infrastructure. 
For review purposes I've split them into generic and per-arch patches, with the last of those a generic cleanup. The next patch provides generic RCU page-table freeing, and the followup is a patch converting s390 to use this. I've also got 4 patches from DaveM lined up (not included in this series) that use this to implement gup_fast() for sparc64. Then there is one patch that extends the generic mmu_gather batching. After that follow the mm preemptibility patches; these make part of the mm a lot more preemptible. They convert i_mmap_lock and anon_vma->lock to mutexes, which, together with the mmu_gather rework, makes mmu_gather preemptible as well. Making i_mmap_lock a mutex also enables a clean-up of the truncate code. This also allows for preemptible mmu_notifiers, something that I think XPMEM wants. Furthermore, it removes the new and universally detested unmap_mutex. This patch: Remove the first obstacle towards a fully preemptible mmu_gather. The current scheme assumes mmu_gather is always done with preemption disabled and uses per-cpu storage for the page batches. Change this to try to allocate a page for batching and, in case of failure, use a small on-stack array to make some progress. Preemptible mmu_gather is desired in general and usable once i_mmap_lock becomes a mutex. Doing it before the mutex conversion saves us from having to rework the code by moving the mmu_gather bits inside the pte_lock. Also avoid flushing the tlb batches from under the pte lock; this is useful even without the i_mmap_lock conversion as it significantly reduces pte lock hold times. [akpm@linux-foundation.org: fix comment typo] Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mm.h')
diff --git a/include/linux/mm.h b/include/linux/mm.h index d2948af126ca..ffcce9bf2b54 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -906,7 +906,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); -- cgit v1.2.3
From 97a894136f29802da19a15541de3c019e1ca147e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:04 -0700 Subject: mm: Remove i_mmap_lock lockbreak
Hugh says: "The only significant loser, I think, would be page reclaim (when concurrent with truncation): could spin for a long time waiting for the i_mmap_mutex it expects would soon be dropped?"
Counterpoints:
- cpu contention makes the spin stop (need_resched())
- zap pages should be freeing pages at a higher rate than reclaim ever can
I think the simplification of the truncate code is definitely worth it. Effectively reverts: 2aa15890f3c ("mm: prevent concurrent unmap_mapping_range() on the same inode") and takes out the code that caused its problem.
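For reference, with the two fields removed in the hunk below, a caller of the zap path now only describes the mapping window; a minimal, illustrative initializer (the local variable names are invented):

    struct zap_details details = {
        .check_mapping = mapping,      /* only unmap pages of this mapping */
        .first_index   = start_index,  /* lowest page->index to unmap */
        .last_index    = end_index,    /* highest page->index to unmap */
    };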
Signed-off-by: Peter Zijlstra Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index ffcce9bf2b54..2ad0ac8c3f32 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -895,8 +895,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ - unsigned long truncate_count; /* Compare vm_truncate_count */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From a238ab5b0239575c179f4976064192c3f7409dad Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 May 2011 17:12:16 -0700 Subject: mm: break out page allocation warning code This originally started as a simple patch to give vmalloc() some more verbose output on failure on top of the plain page allocator messages. Johannes suggested that it might be nicer to lead with the vmalloc() info _before_ the page allocator messages. But, I do think there's a lot of value in what __alloc_pages_slowpath() does with its filtering and so forth. This patch creates a new function which other allocators can call instead of relying on the internal page allocator warnings. It also gives this function private rate-limiting which separates it from other printk_ratelimit() users. Signed-off-by: Dave Hansen Cc: Johannes Weiner Cc: David Rientjes Cc: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ad0ac8c3f32..8e2f2cd821f7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1387,6 +1387,8 @@ extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; +extern void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); + extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); -- cgit v1.2.3 From a09ed5e00084448453c8bada4dcd31e5fbfc2f21 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 24 May 2011 17:12:26 -0700 Subject: vmscan: change shrink_slab() interfaces by passing shrink_control Consolidate the existing parameters to shrink_slab() into a new shrink_control struct. This is needed later to pass the same struct to shrinkers. 
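As an illustration of the new calling convention introduced here (a hedged sketch, not the literal reclaim code; the local variable names are invented, and the follow-up patch in this series extends the interface further), a reclaim-side caller now bundles its parameters before invoking shrink_slab():

    struct shrink_control sc = {
        .gfp_mask   = gfp_mask,     /* allocation context driving reclaim */
        .nr_scanned = nr_scanned,   /* pages scanned by page reclaim */
    };

    nr_slab = shrink_slab(&sc, lru_pages);   /* new signature declared below */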
Signed-off-by: Ying Han Cc: KOSAKI Motohiro Cc: Minchan Kim Acked-by: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Hugh Dickins Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e2f2cd821f7..32cfa9602d00 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1161,6 +1161,15 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) } #endif +/* + * This struct is used to pass information from page reclaim to the shrinkers. + * We consolidate the values for easier extention later. + */ +struct shrink_control { + unsigned long nr_scanned; + gfp_t gfp_mask; +}; + /* * A callback you can register to apply pressure to ageable caches. * @@ -1630,8 +1639,8 @@ int in_gate_area_no_mm(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages); +unsigned long shrink_slab(struct shrink_control *shrink, + unsigned long lru_pages); #ifndef CONFIG_MMU #define randomize_va_space 0 -- cgit v1.2.3 From 1495f230fa7750479c79e3656286b9183d662077 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 24 May 2011 17:12:27 -0700 Subject: vmscan: change shrinker API by passing shrink_control struct Change each shrinker's API by consolidating the existing parameters into shrink_control struct. This will simplify any further features added w/o touching each file of shrinker. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: fix warning] [kosaki.motohiro@jp.fujitsu.com: fix up new shrinker API] [akpm@linux-foundation.org: fix xfs warning] [akpm@linux-foundation.org: update gfs2] Signed-off-by: Ying Han Cc: KOSAKI Motohiro Cc: Minchan Kim Acked-by: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Hugh Dickins Cc: Dave Hansen Cc: Steven Whitehouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32cfa9602d00..5cbbf78eaac7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1166,18 +1166,20 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) * We consolidate the values for easier extention later. */ struct shrink_control { - unsigned long nr_scanned; gfp_t gfp_mask; + + /* How many slab objects shrinker() should scan and try to reclaim */ + unsigned long nr_to_scan; }; /* * A callback you can register to apply pressure to ageable caches. * - * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should - * look through the least-recently-used 'nr_to_scan' entries and - * attempt to free them up. It should return the number of objects - * which remain in the cache. If it returns -1, it means it cannot do - * any scanning at this time (eg. there is a risk of deadlock). + * 'sc' is passed shrink_control which includes a count 'nr_to_scan' + * and a 'gfpmask'. It should look through the least-recently-used + * 'nr_to_scan' entries and attempt to free them up. It should return + * the number of objects which remain in the cache. 
If it returns -1, it means + * it cannot do any scanning at this time (eg. there is a risk of deadlock). * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. @@ -1186,7 +1188,7 @@ struct shrink_control { * querying the cache size, so a fastpath for that case is appropriate. */ struct shrinker { - int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask); + int (*shrink)(struct shrinker *, struct shrink_control *sc); int seeks; /* seeks to recreate an obj */ /* These are for internal use */ @@ -1640,7 +1642,8 @@ int in_gate_area_no_mm(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long lru_pages); + unsigned long nr_pages_scanned, + unsigned long lru_pages); #ifndef CONFIG_MMU #define randomize_va_space 0 -- cgit v1.2.3 From bf4e8902ee5080f5d2c810b639e7e778c8082b52 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Tue, 24 May 2011 17:12:32 -0700 Subject: mm: enable set_page_section() only if CONFIG_SPARSEMEM and !CONFIG_SPARSEMEM_VMEMMAP set_page_section() is meaningful only in CONFIG_SPARSEMEM and !CONFIG_SPARSEMEM_VMEMMAP context. Move it to proper place and amend accordingly functions which are using it. Signed-off-by: Daniel Kiper Acked-by: Dave Hansen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5cbbf78eaac7..6702171f8d0a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -682,6 +682,12 @@ static inline struct zone *page_zone(struct page *page) } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +static inline void set_page_section(struct page *page, unsigned long section) +{ + page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} + static inline unsigned long page_to_section(struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; @@ -700,18 +706,14 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } -static inline void set_page_section(struct page *page, unsigned long section) -{ - page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); - page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; -} - static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) set_page_section(page, pfn_to_section_nr(pfn)); +#endif } /* -- cgit v1.2.3 From ba93fa81b5f2bf0076407a3a777fff122ce16220 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Tue, 24 May 2011 17:12:34 -0700 Subject: mm: do not define PFN_SECTION_SHIFT if !CONFIG_SPARSEMEM Do not define PFN_SECTION_SHIFT if !CONFIG_SPARSEMEM. 
Signed-off-by: Daniel Kiper Acked-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6702171f8d0a..32309f6542e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -605,10 +605,6 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define NODE_NOT_IN_PAGE_FLAGS #endif -#ifndef PFN_SECTION_SHIFT -#define PFN_SECTION_SHIFT 0 -#endif - /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures -- cgit v1.2.3 From 172703b08cd05e2d5196ac13e94cc186f629d58b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 24 May 2011 17:12:36 -0700 Subject: mm: delete non-atomic mm counter implementation The problem with having two different types of counters is that developers adding new code need to keep in mind whether it's safe to use both the atomic and non-atomic implementations. For example, when adding new callers of the *_mm_counter() functions a developer needs to ensure that those paths are always executed with page_table_lock held, in case we're using the non-atomic implementation of mm counters. Hugh Dickins introduced the atomic mm counters in commit f412ac08c986 ("[PATCH] mm: fix rss and mmlist locking"). When asked why he left the non-atomic counters around he said, | The only reason was to avoid adding costly atomic operations into a | configuration that had no need for them there: the page_table_lock | sufficed. | | Certainly it would be simpler just to delete the non-atomic variant. | | And I think it's fair to say that any configuration on which we're | measuring performance to that degree (rather than "does it boot fast?" | type measurements), would already be going the split ptlocks route. Removing the non-atomic counters eases the maintenance burden because developers no longer have to mindful of the two implementations when using *_mm_counter(). Note that all architectures provide a means of atomically updating atomic_long_t variables, even if they have to revert to the generic spinlock implementation because they don't support 64-bit atomic instructions (see lib/atomic64.c). Signed-off-by: Matt Fleming Acked-by: Dave Hansen Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 44 +++++++------------------------------------- 1 file changed, 7 insertions(+), 37 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32309f6542e8..48e458190d88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1053,65 +1053,35 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, /* * per-process(per-mm_struct) statistics. */ -#if defined(SPLIT_RSS_COUNTING) -/* - * The mm counters are not protected by its page_table_lock, - * so must be incremented atomically. 
- */ static inline void set_mm_counter(struct mm_struct *mm, int member, long value) { atomic_long_set(&mm->rss_stat.count[member], value); } +#if defined(SPLIT_RSS_COUNTING) unsigned long get_mm_counter(struct mm_struct *mm, int member); - -static inline void add_mm_counter(struct mm_struct *mm, int member, long value) -{ - atomic_long_add(value, &mm->rss_stat.count[member]); -} - -static inline void inc_mm_counter(struct mm_struct *mm, int member) -{ - atomic_long_inc(&mm->rss_stat.count[member]); -} - -static inline void dec_mm_counter(struct mm_struct *mm, int member) -{ - atomic_long_dec(&mm->rss_stat.count[member]); -} - -#else /* !USE_SPLIT_PTLOCKS */ -/* - * The mm counters are protected by its page_table_lock, - * so can be incremented directly. - */ -static inline void set_mm_counter(struct mm_struct *mm, int member, long value) -{ - mm->rss_stat.count[member] = value; -} - +#else static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - return mm->rss_stat.count[member]; + return atomic_long_read(&mm->rss_stat.count[member]); } +#endif static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - mm->rss_stat.count[member] += value; + atomic_long_add(value, &mm->rss_stat.count[member]); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - mm->rss_stat.count[member]++; + atomic_long_inc(&mm->rss_stat.count[member]); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - mm->rss_stat.count[member]--; + atomic_long_dec(&mm->rss_stat.count[member]); } -#endif /* !USE_SPLIT_PTLOCKS */ - static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + -- cgit v1.2.3 From c856507f2b2b47a49d8587afb58930b463f6bff4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 May 2011 17:12:40 -0700 Subject: mm: remove last trace of shmem_get_unmapped_area Remove noMMU declaration of shmem_get_unmapped_area() from mm.h: it fell out of use in 2.6.21 and ceased to exist in 2.6.29. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48e458190d88..8eb969ebf904 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -873,14 +873,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user); struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); -#ifndef CONFIG_MMU -extern unsigned long shmem_get_unmapped_area(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags); -#endif - extern int can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); -- cgit v1.2.3 From ca16d140af91febe25daeb9e032bf8bd46b8c31f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 26 May 2011 19:16:19 +0900 Subject: mm: don't access vm_flags as 'int' The type of vma->vm_flags is 'unsigned long'. Neither 'int' nor 'unsigned int'. This patch fixes such misuse. Signed-off-by: KOSAKI Motohiro [ Changed to use a typedef - we'll extend it to cover more cases later, since there has been discussion about making it a 64-bit type.. 
- Linus ] Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h')
diff --git a/include/linux/mm.h b/include/linux/mm.h index 8eb969ebf904..fb8e814f78dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -165,12 +165,12 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return (vma->vm_flags & VM_PFN_AT_MMAP); + return !!(vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) { - return (vma->vm_flags & VM_PFNMAP); + return !!(vma->vm_flags & VM_PFNMAP); } /* @@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff); + vm_flags_t vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, -- cgit v1.2.3
From 3864601387cf4196371e3c1897fdffa5228296f9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 26 May 2011 16:25:46 -0700 Subject: mm: extract exe_file handling from procfs
Setup and cleanup of mm_struct->exe_file is currently done in fs/proc/. This was because exe_file was needed only for /proc//exe. Since we will also need the exe_file functionality for core dumps (so the core name can contain the full binary path), build this functionality into the kernel unconditionally. To achieve that, move it out of proc FS into kernel/, where it really belongs. By doing that we can make dup_mm_exe_file static. We can also drop the linux/proc_fs.h inclusion from fs/exec.c and kernel/fork.c. Signed-off-by: Jiri Slaby Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h')
diff --git a/include/linux/mm.h b/include/linux/mm.h index fb8e814f78dc..9670f71d7be9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1408,17 +1408,11 @@ extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -#ifdef CONFIG_PROC_FS /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ extern void added_exe_file_vma(struct mm_struct *mm); extern void removed_exe_file_vma(struct mm_struct *mm); -#else -static inline void added_exe_file_vma(struct mm_struct *mm) -{} - -static inline void removed_exe_file_vma(struct mm_struct *mm) -{} -#endif /* CONFIG_PROC_FS */ +extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); extern int install_special_mapping(struct mm_struct *mm, -- cgit v1.2.3
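A minimal, hypothetical sketch of how a core-dump path might use the two helpers declared in the hunk above; it assumes get_mm_exe_file() returns a referenced struct file that the caller drops with fput(), and the surrounding context is invented for illustration:

    struct file *exe_file = get_mm_exe_file(current->mm);

    if (exe_file) {
        /* e.g. resolve exe_file->f_path into the core name here */
        fput(exe_file);
    }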