From 3b8967d713d7426e9dd107d065208b84adface91 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:19:37 -0700 Subject: include/linux/smp.h:on_each_cpu(): switch back to a C function Revert commit c846ef7deba2 ("include/linux/smp.h:on_each_cpu(): switch back to a macro"). It turns out that the problematic linux/irqflags.h include was fixed within ia64 and mn10300. Cc: Geert Uytterhoeven Cc: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index c181399f2c20..c8488763277f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,6 +11,7 @@ #include #include #include +#include extern void cpu_idle(void); @@ -139,14 +140,17 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -#define on_each_cpu(func, info, wait) \ - ({ \ - unsigned long __flags; \ - local_irq_save(__flags); \ - func(info); \ - local_irq_restore(__flags); \ - 0; \ - }) + +static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; +} + /* * Note we still need to test the mask even for UP * because we actually can get an empty mask from -- cgit v1.2.3 From e1403b8edf669ff49bbdf602cc97fefa2760cb15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:06 -0700 Subject: include/linux/sched.h: don't use task->pid/tgid in same_thread_group/has_group_leader_pid task_struct->pid/tgid should go away. 1. Change same_thread_group() to use task->signal for comparison. 2. Change has_group_leader_pid(task) to compare task_pid(task) with signal->leader_pid. Signed-off-by: Oleg Nesterov Cc: Michal Hocko Cc: Sergey Dyasly Reviewed-by: "Eric W. Biederman" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ce1e1c0aaa33..45f254dddafc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2169,15 +2169,15 @@ static inline bool thread_group_leader(struct task_struct *p) * all we care about is that we have a task with the appropriate * pid, we don't actually care if we have the right task. */ -static inline int has_group_leader_pid(struct task_struct *p) +static inline bool has_group_leader_pid(struct task_struct *p) { - return p->pid == p->tgid; + return task_pid(p) == p->signal->leader_pid; } static inline -int same_thread_group(struct task_struct *p1, struct task_struct *p2) +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { - return p1->tgid == p2->tgid; + return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) -- cgit v1.2.3 From bab55417b10c95e6bff8cea315c315adfa009487 Mon Sep 17 00:00:00 2001 From: Cai Zhiyong Date: Wed, 11 Sep 2013 14:20:09 -0700 Subject: block: support embedded device command line partition Read block device partition table from command line. The partition used for fixed block device (eMMC) embedded device. It is no MBR, save storage space. Bootloader can be easily accessed by absolute address of data on the block device. Users can easily change the partition. This code reference MTD partition, source "drivers/mtd/cmdlinepart.c" About the partition verbose reference "Documentation/block/cmdline-partition.txt" [akpm@linux-foundation.org: fix printk text] [yongjun_wei@trendmicro.com.cn: fix error return code in parse_parts()] Signed-off-by: Cai Zhiyong Cc: Karel Zak Cc: "Wanglin (Albert)" Cc: Marius Groeger Cc: David Woodhouse Cc: Jens Axboe Cc: Brian Norris Cc: Artem Bityutskiy Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cmdline-parser.h | 43 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 include/linux/cmdline-parser.h (limited to 'include/linux') diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h new file mode 100644 index 000000000000..98e892ef6d5a --- /dev/null +++ b/include/linux/cmdline-parser.h @@ -0,0 +1,43 @@ +/* + * Parsing command line, get the partitions information. + * + * Written by Cai Zhiyong + * + */ +#ifndef CMDLINEPARSEH +#define CMDLINEPARSEH + +#include + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +void cmdline_parts_free(struct cmdline_parts **parts); + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev); + +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param); + +#endif /* CMDLINEPARSEH */ -- cgit v1.2.3 From ef0855d334e1e4af7c3e0c42146a8479ea14a5ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:14 -0700 Subject: mm: mempolicy: turn vma_set_policy() into vma_dup_policy() Simple cleanup. Every user of vma_set_policy() does the same work, this looks a bit annoying imho. And the new trivial helper which does mpol_dup() + vma_set_policy() to simplify the callers. Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0d7df39a5885..b2f897789838 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -91,7 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) } #define vma_policy(vma) ((vma)->vm_policy) -#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) static inline void mpol_get(struct mempolicy *pol) { @@ -126,6 +125,7 @@ struct shared_policy { spinlock_t lock; }; +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, @@ -240,7 +240,12 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) } #define vma_policy(vma) NULL -#define vma_set_policy(vma, pol) do {} while(0) + +static inline int +vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} static inline void numa_policy_init(void) { -- cgit v1.2.3 From 9824cf9753ecbe8f5b47aa9b2f218207defea211 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:20:23 -0700 Subject: mm: vmstats: tlb flush counters I was investigating some TLB flush scaling issues and realized that we do not have any good methods for figuring out how many TLB flushes we are doing. It would be nice to be able to do these in generic code, but the arch-independent calls don't explicitly specify whether we actually need to do remote flushes or not. In the end, we really need to know if we actually _did_ global vs. local invalidations, so that leaves us with few options other than to muck with the counters from arch-specific code. Signed-off-by: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..dc2cdf07ac14 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif + NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ + NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ + NR_TLB_LOCAL_FLUSH_ALL, + NR_TLB_LOCAL_FLUSH_ONE, + NR_TLB_LOCAL_FLUSH_ONE_KERNEL, NR_VM_EVENT_ITEMS }; -- cgit v1.2.3 From 6df46865ff8715932e7d42e52cac17e8461758cb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:20:24 -0700 Subject: mm: vmstats: track TLB flush stats on UP too The previous patch doing vmstats for TLB flushes ("mm: vmstats: tlb flush counters") effectively missed UP since arch/x86/mm/tlb.c is only compiled for SMP. UP systems do not do remote TLB flushes, so compile those counters out on UP. arch/x86/kernel/cpu/mtrr/generic.c calls __flush_tlb() directly. This is probably an optimization since both the mtrr code and __flush_tlb() write cr4. It would probably be safe to make that a flush_tlb_all() (and then get these statistics), but the mtrr code is ancient and I'm hesitant to touch it other than to just stick in the counters. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index dc2cdf07ac14..1855f0a22add 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,11 +70,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ +#endif NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, - NR_TLB_LOCAL_FLUSH_ONE_KERNEL, NR_VM_EVENT_ITEMS }; -- cgit v1.2.3 From 2a8f9449343260373398d59228a62a4332ea513a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:28 -0700 Subject: swap: change block allocation algorithm for SSD I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30% CPU time (when cluster is hard to find, the CPU time can be up to 80%), which becomes a bottleneck. scan_swap_map() scans a byte array to search a 256 page cluster, which is very slow. Here I introduced a simple algorithm to search cluster. Since we only care about 256 pages cluster, we can just use a counter to track if a cluster is free. Every 256 pages use one int to store the counter. If the counter of a cluster is 0, the cluster is free. All free clusters will be added to a list, so searching cluster is very efficient. With this, scap_swap_map() overhead disappears. This might help low end SD card swap too. Because if the cluster is aligned, SD firmware can do flash erase more efficiently. We only enable the algorithm for SSD. Hard disk swap isn't fast enough and has downside with the algorithm which might introduce regression (see below). The patch slightly changes which cluster is choosen. It always adds free cluster to list tail. This can help wear leveling for low end SSD too. And if no cluster found, the scan_swap_map() will do search from the end of last cluster. So if no cluster found, the scan_swap_map() will do search from the end of last free cluster, which is random. For SSD, this isn't a problem at all. Another downside is the cluster must be aligned to 256 pages, which will reduce the chance to find a cluster. I would expect this isn't a big problem for SSD because of the non-seek penality. (And this is the reason I only enable the algorithm for SSD). Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d95cde5e257d..cb5baebf31d6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -181,6 +181,23 @@ enum { #define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The data field stores next cluster if the cluster is free or cluster usage + * counter otherwise. The flags field determines if a cluster is free. This is + * protected by swap_info_struct.lock. + */ +struct swap_cluster_info { + unsigned int data:24; + unsigned int flags:8; +}; +#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + /* * The in-memory structure used to track swap areas. */ @@ -191,6 +208,9 @@ struct swap_info_struct { signed char next; /* next type on the swap list */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct swap_cluster_info free_cluster_head; /* free cluster list head */ + struct swap_cluster_info free_cluster_tail; /* free cluster list tail */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ -- cgit v1.2.3 From 815c2c543d3aeb914a361f981440ece552778724 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:30 -0700 Subject: swap: make swap discard async swap can do cluster discard for SSD, which is good, but there are some problems here: 1. swap do the discard just before page reclaim gets a swap entry and writes the disk sectors. This is useless for high end SSD, because an overwrite to a sector implies a discard to original sector too. A discard + overwrite == overwrite. 2. the purpose of doing discard is to improve SSD firmware garbage collection. Idealy we should send discard as early as possible, so firmware can do something smart. Sending discard just after swap entry is freed is considered early compared to sending discard before write. Of course, if workload is already bound to gc speed, sending discard earlier or later doesn't make 3. block discard is a sync API, which will delay scan_swap_map() significantly. 4. Write and discard command can be executed parallel in PCIe SSD. Making swap discard async can make execution more efficiently. This patch makes swap discard async and moves discard to where swap entry is freed. Discard and write have no dependence now, so above issues can be avoided. Idealy we should do discard for any freed sectors, but some SSD discard is very slow. This patch still does discard for a whole cluster. My test does a several round of 'mmap, write, unmap', which will trigger a lot of swap discard. In a fusionio card, with this patch, the test runtime is reduced to 18% of the time without it, so around 5.5x faster. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index cb5baebf31d6..8a3c4a1caa14 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -217,8 +217,6 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int lowest_alloc; /* while preparing discard cluster */ - unsigned int highest_alloc; /* while preparing discard cluster */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ @@ -232,14 +230,18 @@ struct swap_info_struct { * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc and - * highest_alloc. other fields are only - * changed at swapon/swapoff, so are - * protected by swap_lock. changing - * flags need hold this lock and - * swap_lock. If both locks need hold, - * hold swap_lock first. + * cluster_nr, lowest_alloc, + * highest_alloc, free/discard cluster + * list. other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first. */ + struct work_struct discard_work; /* discard worker */ + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */ + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ }; struct swap_list_t { -- cgit v1.2.3 From ebc2a1a69111eadfeda8487e577f1a5d42ef0dae Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:32 -0700 Subject: swap: make cluster allocation per-cpu swap cluster allocation is to get better request merge to improve performance. But the cluster is shared globally, if multiple tasks are doing swap, this will cause interleave disk access. While multiple tasks swap is quite common, for example, each numa node has a kswapd thread doing swap and multiple threads/processes doing direct page reclaim. ioscheduler can't help too much here, because tasks don't send swapout IO down to block layer in the meantime. Block layer does merge some IOs, but a lot not, depending on how many tasks are doing swapout concurrently. In practice, I've seen a lot of small size IO in swapout workloads. We makes the cluster allocation per-cpu here. The interleave disk access issue goes away. All tasks swapout to their own cluster, so swapout will become sequential, which can be easily merged to big size IO. If one CPU can't get its per-cpu cluster (for example, there is no free cluster anymore in the swap), it will fallback to scan swap_map. The CPU can still continue swap. We don't need recycle free swap entries of other CPUs. In my test (swap to a 2-disk raid0 partition), this improves around 10% swapout throughput, and request size is increased significantly. How does this impact swap readahead is uncertain though. On one side, page reclaim always isolates and swaps several adjancent pages, this will make page reclaim write the pages sequentially and benefit readahead. On the other side, several CPU write pages interleave means the pages don't live _sequentially_ but relatively _near_. In the per-cpu allocation case, if adjancent pages are written by different cpus, they will live relatively _far_. So how this impacts swap readahead depends on how many pages page reclaim isolates and swaps one time. If the number is big, this patch will benefit swap readahead. Of course, this is about sequential access pattern. The patch has no impact for random access pattern, because the new cluster allocation algorithm is just for SSD. Alternative solution is organizing swap layout to be per-mm instead of this per-cpu approach. In the per-mm layout, we allocate a disk range for each mm, so pages of one mm live in swap disk adjacently. per-mm layout has potential issues of lock contention if multiple reclaimers are swap pages from one mm. For a sequential workload, per-mm layout is better to implement swap readahead, because pages from the mm are adjacent in disk. But per-cpu layout isn't very bad in this workload, as page reclaim always isolates and swaps several pages one time, such pages will still live in disk sequentially and readahead can utilize this. For a random workload, per-mm layout isn't beneficial of request merge, because it's quite possible pages from different mm are swapout in the meantime and IO can't be merged in per-mm layout. while with per-cpu layout we can merge requests from any mm. Considering random workload is more popular in workloads with swap (and per-cpu approach isn't too bad for sequential workload too), I'm choosing per-cpu layout. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 8a3c4a1caa14..24db9142e93b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -198,6 +198,16 @@ struct swap_cluster_info { #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +/* + * We assign a cluster to each CPU, so each CPU can allocate swap entry from + * its own cluster and swapout sequentially. The purpose is to optimize swapout + * throughput. + */ +struct percpu_cluster { + struct swap_cluster_info index; /* Current cluster index */ + unsigned int next; /* Likely next allocation offset */ +}; + /* * The in-memory structure used to track swap areas. */ @@ -217,6 +227,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 81c0a2bb515fd4daae8cab64352877480792b515 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:47 -0700 Subject: mm: page_alloc: fair zone allocator policy Each zone that holds userspace pages of one workload must be aged at a speed proportional to the zone size. Otherwise, the time an individual page gets to stay in memory depends on the zone it happened to be allocated in. Asymmetry in the zone aging creates rather unpredictable aging behavior and results in the wrong pages being reclaimed, activated etc. But exactly this happens right now because of the way the page allocator and kswapd interact. The page allocator uses per-node lists of all zones in the system, ordered by preference, when allocating a new page. When the first iteration does not yield any results, kswapd is woken up and the allocator retries. Due to the way kswapd reclaims zones below the high watermark while a zone can be allocated from when it is above the low watermark, the allocator may keep kswapd running while kswapd reclaim ensures that the page allocator can keep allocating from the first zone in the zonelist for extended periods of time. Meanwhile the other zones rarely see new allocations and thus get aged much slower in comparison. The result is that the occasional page placed in lower zones gets relatively more time in memory, even gets promoted to the active list after its peers have long been evicted. Meanwhile, the bulk of the working set may be thrashing on the preferred zone even though there may be significant amounts of memory available in the lower zones. Even the most basic test -- repeatedly reading a file slightly bigger than memory -- shows how broken the zone aging is. In this scenario, no single page should be able stay in memory long enough to get referenced twice and activated, but activation happens in spades: $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 0 nr_active_file 8 nr_inactive_file 1582 nr_active_file 11994 $ cat data data data data >/dev/null $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 70 nr_inactive_file 258753 nr_active_file 443214 nr_inactive_file 149793 nr_active_file 12021 Fix this with a very simple round robin allocator. Each zone is allowed a batch of allocations that is proportional to the zone's size, after which it is treated as full. The batch counters are reset when all zones have been tried and the allocator enters the slowpath and kicks off kswapd reclaim. Allocation and reclaim is now fairly spread out to all available/allowable zones: $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 174 nr_active_file 4865 nr_inactive_file 53 nr_active_file 860 $ cat data data data data >/dev/null $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 666622 nr_active_file 4988 nr_inactive_file 190969 nr_active_file 937 When zone_reclaim_mode is enabled, allocations will now spread out to all zones on the local node, not just the first preferred zone (which on a 4G node might be a tiny Normal zone). Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Cc: Zlatko Calusic Tested-by: Kevin Hilman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index af4a3b77a8de..ac1ea796ec0f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -105,6 +105,7 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_ALLOC_BATCH, NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ -- cgit v1.2.3 From d2cf5ad6312ca9913464fac40fb47ba47ad945c4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:29 -0700 Subject: swap: clean-up #ifdef in page_mapping() PageSwapCache() is always false when !CONFIG_SWAP, so compiler properly discard related code. Therefore, we don't need #ifdef explicitly. Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 24db9142e93b..c03c139219c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,6 +447,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #else /* CONFIG_SWAP */ +#define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL -- cgit v1.2.3 From 2bb921e526656556e68f99f5f15a4a1bf2691844 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:30 -0700 Subject: vmstat: create separate function to fold per cpu diffs into local counters The main idea behind this patchset is to reduce the vmstat update overhead by avoiding interrupt enable/disable and the use of per cpu atomics. This patch (of 3): It is better to have a separate folding function because refresh_cpu_vm_stats() also does other things like expire pages in the page allocator caches. If we have a separate function then refresh_cpu_vm_stats() is only called from the local cpu which allows additional optimizations. The folding function is only called when a cpu is being downed and therefore no other processor will be accessing the counters. Also simplifies synchronization. [akpm@linux-foundation.org: fix UP build] Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c586679b6fef..502767f4e4d4 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -198,7 +198,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); -void refresh_cpu_vm_stats(int); +void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); @@ -255,6 +255,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } +static inline void cpu_vm_stats_fold(int cpu) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } -- cgit v1.2.3 From 674470d97958a0ec72f72caf7f6451da40159cc7 Mon Sep 17 00:00:00 2001 From: Joonyoung Shim Date: Wed, 11 Sep 2013 14:21:43 -0700 Subject: lib/genalloc.c: fix overflow of ending address of memory chunk In struct gen_pool_chunk, end_addr means the end address of memory chunk (inclusive), but in the implementation it is treated as address + size of memory chunk (exclusive), so it points to the address plus one instead of correct ending address. The ending address of memory chunk plus one will cause overflow on the memory chunk including the last address of memory map, e.g. when starting address is 0xFFF00000 and size is 0x100000 on 32bit machine, ending address will be 0x100000000. Use correct ending address like starting address + size - 1. [akpm@linux-foundation.org: add comment to struct gen_pool_chunk:end_addr] Signed-off-by: Joonyoung Shim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 661d374aeb2d..f8d41cb1cbe0 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -66,8 +66,8 @@ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ atomic_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ - unsigned long start_addr; /* starting address of memory chunk */ - unsigned long end_addr; /* ending address of memory chunk */ + unsigned long start_addr; /* start address of memory chunk */ + unsigned long end_addr; /* end address of memory chunk (inclusive) */ unsigned long bits[0]; /* bitmap for allocating memory chunk */ }; -- cgit v1.2.3 From 31caf665e666b51fe36efd1e54031ed29e86c0b4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:21:59 -0700 Subject: mm: migrate: make core migration code aware of hugepage Currently hugepage migration is available only for soft offlining, but it's also useful for some other users of page migration (clearly because users of hugepage can enjoy the benefit of mempolicy and memory hotplug.) So this patchset tries to extend such users to support hugepage migration. The target of this patchset is to enable hugepage migration for NUMA related system calls (migrate_pages(2), move_pages(2), and mbind(2)), and memory hotplug. This patchset does not add hugepage migration for memory compaction, because users of memory compaction mainly expect to construct thp by arranging raw pages, and there's little or no need to compact hugepages. CMA, another user of page migration, can have benefit from hugepage migration, but is not enabled to support it for now (just because of lack of testing and expertise in CMA.) Hugepage migration of non pmd-based hugepage (for example 1GB hugepage in x86_64, or hugepages in architectures like ia64) is not enabled for now (again, because of lack of testing.) As for how these are achived, I extended the API (migrate_pages()) to handle hugepage (with patch 1 and 2) and adjusted code of each caller to check and collect movable hugepages (with patch 3-7). Remaining 2 patches are kind of miscellaneous ones to avoid unexpected behavior. Patch 8 is about making sure that we only migrate pmd-based hugepages. And patch 9 is about choosing appropriate zone for hugepage allocation. My test is mainly functional one, simply kicking hugepage migration via each entry point and confirm that migration is done correctly. Test code is available here: git://github.com/Naoya-Horiguchi/test_hugepage_migration_extension.git And I always run libhugetlbfs test when changing hugetlbfs's code. With this patchset, no regression was found in the test. This patch (of 9): Before enabling each user of page migration to support hugepage, this patch enables the list of pages for migration to link not only LRU pages, but also hugepages. As a result, putback_movable_pages() and migrate_pages() can handle both of LRU pages and hugepages. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c2b1801a160b..bc8d8370cd0d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -66,6 +66,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); +bool isolate_huge_page(struct page *page, struct list_head *list); +void putback_active_hugepage(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -134,6 +136,8 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) return 0; } +#define isolate_huge_page(p, l) false +#define putback_active_hugepage(p) do {} while (0) static inline void copy_huge_page(struct page *dst, struct page *src) { } -- cgit v1.2.3 From b8ec1cee5a4375c1244b85709138a2eac2d89cb6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:01 -0700 Subject: mm: soft-offline: use migrate_pages() instead of migrate_huge_page() Currently migrate_huge_page() takes a pointer to a hugepage to be migrated as an argument, instead of taking a pointer to the list of hugepages to be migrated. This behavior was introduced in commit 189ebff28 ("hugetlb: simplify migrate_huge_page()"), and was OK because until now hugepage migration is enabled only for soft-offlining which migrates only one hugepage in a single call. But the situation will change in the later patches in this series which enable other users of page migration to support hugepage migration. They can kick migration for both of normal pages and hugepages in a single call, so we need to go back to original implementation which uses linked lists to collect the hugepages to be migrated. With this patch, soft_offline_huge_page() switches to use migrate_pages(), and migrate_huge_page() is not used any more. So let's remove it. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a405d3dc0f61..6fe521420631 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,8 +41,6 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason); -extern int migrate_huge_page(struct page *, new_page_t x, - unsigned long private, enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -62,9 +60,6 @@ static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } -static inline int migrate_huge_page(struct page *page, new_page_t x, - unsigned long private, enum migrate_mode mode) - { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } -- cgit v1.2.3 From 74060e4d78795c7c43805133cb717d82533d4e0d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:06 -0700 Subject: mm: mbind: add hugepage migration code to mbind() Extend do_mbind() to handle vma with VM_HUGETLB set. We will be able to migrate hugepage with mbind(2) after applying the enablement patch which comes later in this series. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bc8d8370cd0d..d1db00790a84 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -265,6 +265,8 @@ struct huge_bootmem_page { }; struct page *alloc_huge_page_node(struct hstate *h, int nid); +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve); /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); @@ -378,6 +380,7 @@ static inline pgoff_t basepage_index(struct page *page) #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL +#define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL -- cgit v1.2.3 From 71ea2efb1e936a127690a0a540b3a6162f95e48a Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:08 -0700 Subject: mm: migrate: remove VM_HUGETLB from vma flag check in vma_migratable() Enable hugepage migration from migrate_pages(2), move_pages(2), and mbind(2). Signed-off-by: Naoya Horiguchi Acked-by: Hillf Danton Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index b2f897789838..da6716b9e3fe 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -173,7 +173,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return 0; /* * Migration allocates pages in the highest zone. If we cannot -- cgit v1.2.3 From c8721bbbdd36382de51cd6b7a56322e0acca2414 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:09 -0700 Subject: mm: memory-hotplug: enable memory hotplug to handle hugepage Until now we can't offline memory blocks which contain hugepages because a hugepage is considered as an unmovable page. But now with this patch series, a hugepage has become movable, so by using hugepage migration we can offline such memory blocks. What's different from other users of hugepage migration is that we need to decompose all the hugepages inside the target memory block into free buddy pages after hugepage migration, because otherwise free hugepages remaining in the memory block intervene the memory offlining. For this reason we introduce new functions dissolve_free_huge_page() and dissolve_free_huge_pages(). Other than that, what this patch does is straightforwardly to add hugepage migration code, that is, adding hugepage code to the functions which scan over pfn and collect hugepages to be migrated, and adding a hugepage allocation function to alloc_migrate_target(). As for larger hugepages (1GB for x86_64), it's not easy to do hotremove over them because it's larger than memory block. So we now simply leave it to fail as it is. [yongjun_wei@trendmicro.com.cn: remove duplicated include] Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Cc: Hillf Danton Cc: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1db00790a84..2e02c4ed1035 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -68,6 +68,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +bool is_hugepage_active(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -138,6 +139,7 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) #define isolate_huge_page(p, l) false #define putback_active_hugepage(p) do {} while (0) +#define is_hugepage_active(x) false static inline void copy_huge_page(struct page *dst, struct page *src) { } @@ -377,6 +379,9 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } +extern void dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -403,6 +408,7 @@ static inline pgoff_t basepage_index(struct page *page) { return page->index; } +#define dissolve_free_huge_pages(s, e) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ -- cgit v1.2.3 From 83467efbdb7948146581a56cbd683a22a0684bbb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:11 -0700 Subject: mm: migrate: check movability of hugepage in unmap_and_move_huge_page() Currently hugepage migration works well only for pmd-based hugepages (mainly due to lack of testing,) so we had better not enable migration of other levels of hugepages until we are ready for it. Some users of hugepage migration (mbind, move_pages, and migrate_pages) do page table walk and check pud/pmd_huge() there, so they are safe. But the other users (softoffline and memory hotremove) don't do this, so without this patch they can try to migrate unexpected types of hugepages. To prevent this, we introduce hugepage_migration_support() as an architecture dependent check of whether hugepage are implemented on a pmd basis or not. And on some architecture multiple sizes of hugepages are available, so hugepage_migration_support() also checks hugepage size. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Hillf Danton Cc: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2e02c4ed1035..0393270466c3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -381,6 +381,16 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); +int pmd_huge_support(void); +/* + * Currently hugepage migration is enabled only for pmd-based hugepage. + * This function will be updated when hugepage migration is more widely + * supported. + */ +static inline int hugepage_migration_support(struct hstate *h) +{ + return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); +} #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -409,6 +419,8 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) +#define pmd_huge_support() 0 +#define hugepage_migration_support(h) 0 #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ -- cgit v1.2.3 From e76b63f80d938a1319eb5fb0ae7ea69bddfbae38 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Sep 2013 14:22:17 -0700 Subject: memblock, numa: binary search node id Current early_pfn_to_nid() on arch that support memblock go over memblock.memory one by one, so will take too many try near the end. We can use existing memblock_search to find the node id for given pfn, that could save some time on bigger system that have many entries memblock.memory array. Here are the timing differences for several machines. In each case with the patch less time was spent in __early_pfn_to_nid(). 3.11-rc5 with patch difference (%) -------- ---------- -------------- UV1: 256 nodes 9TB: 411.66 402.47 -9.19 (2.23%) UV2: 255 nodes 16TB: 1141.02 1138.12 -2.90 (0.25%) UV2: 64 nodes 2TB: 128.15 126.53 -1.62 (1.26%) UV2: 32 nodes 2TB: 121.87 121.07 -0.80 (0.66%) Time in seconds. Signed-off-by: Yinghai Lu Cc: Tejun Heo Acked-by: Russ Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f388203db7e8..31e95acddb4d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -60,6 +60,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, + unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); -- cgit v1.2.3 From d9104d1ca9662498339c0de975b4666c30485f4e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 11 Sep 2013 14:22:24 -0700 Subject: mm: track vma changes with VM_SOFTDIRTY bit Pavel reported that in case if vma area get unmapped and then mapped (or expanded) in-place, the soft dirty tracker won't be able to recognize this situation since it works on pte level and ptes are get zapped on unmap, loosing soft dirty bit of course. So to resolve this situation we need to track actions on vma level, there VM_SOFTDIRTY flag comes in. When new vma area created (or old expanded) we set this bit, and keep it here until application calls for clearing soft dirty bit. Thus when user space application track memory changes now it can detect if vma area is renewed. Reported-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d2d59b4149d0..dce24569f8fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ +#ifdef CONFIG_MEM_SOFT_DIRTY +# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#else +# define VM_SOFTDIRTY 0 +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -- cgit v1.2.3 From 7a8010cd36273ff5f6fea5201ef9232f30cebbd9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:35 -0700 Subject: mm: munlock: manual pte walk in fast path instead of follow_page_mask() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently munlock_vma_pages_range() calls follow_page_mask() to obtain each individual struct page. This entails repeated full page table translations and page table lock taken for each page separately. This patch avoids the costly follow_page_mask() where possible, by iterating over ptes within single pmd under single page table lock. The first pte is obtained by get_locked_pte() for non-THP page acquired by the initial follow_page_mask(). The rest of the on-stack pagevec for munlock is filled up using pte_walk as long as pte_present() and vm_normal_page() are sufficient to obtain the struct page. After this patch, a 14% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Cc: Jörn Engel Cc: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dce24569f8fc..03f84b8d7359 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -643,12 +643,12 @@ static inline enum zone_type page_zonenum(const struct page *page) #endif /* - * The identification function is only used by the buddy allocator for - * determining if two pages could be buddies. We are not really - * identifying a zone since we could be using a the section number - * id if we have not node id available in page flags. - * We guarantee only that it will return the same value for two - * combinable pages in a zone. + * The identification function is mainly used by the buddy allocator for + * determining if two pages could be buddies. We are not really identifying + * the zone since we could be using the section number id if we do not have + * node id available in page flags. + * We only guarantee that it will return the same value for two combinable + * pages in a zone. */ static inline int page_zone_id(struct page *page) { -- cgit v1.2.3 From 6e543d5780e36ff5ee56c44d7e2e30db3457a7ed Mon Sep 17 00:00:00 2001 From: Lisa Du Date: Wed, 11 Sep 2013 14:22:36 -0700 Subject: mm: vmscan: fix do_try_to_free_pages() livelock This patch is based on KOSAKI's work and I add a little more description, please refer https://lkml.org/lkml/2012/6/14/74. Currently, I found system can enter a state that there are lots of free pages in a zone but only order-0 and order-1 pages which means the zone is heavily fragmented, then high order allocation could make direct reclaim path's long stall(ex, 60 seconds) especially in no swap and no compaciton enviroment. This problem happened on v3.4, but it seems issue still lives in current tree, the reason is do_try_to_free_pages enter live lock: kswapd will go to sleep if the zones have been fully scanned and are still not balanced. As kswapd thinks there's little point trying all over again to avoid infinite loop. Instead it changes order from high-order to 0-order because kswapd think order-0 is the most important. Look at 73ce02e9 in detail. If watermarks are ok, kswapd will go back to sleep and may leave zone->all_unreclaimable =3D 0. It assume high-order users can still perform direct reclaim if they wish. Direct reclaim continue to reclaim for a high order which is not a COSTLY_ORDER without oom-killer until kswapd turn on zone->all_unreclaimble= . This is because to avoid too early oom-kill. So it means direct_reclaim depends on kswapd to break this loop. In worst case, direct-reclaim may continue to page reclaim forever when kswapd sleeps forever until someone like watchdog detect and finally kill the process. As described in: http://thread.gmane.org/gmane.linux.kernel.mm/103737 We can't turn on zone->all_unreclaimable from direct reclaim path because direct reclaim path don't take any lock and this way is racy. Thus this patch removes zone->all_unreclaimable field completely and recalculates zone reclaimable state every time. Note: we can't take the idea that direct-reclaim see zone->pages_scanned directly and kswapd continue to use zone->all_unreclaimable. Because, it is racy. commit 929bea7c71 (vmscan: all_unreclaimable() use zone->all_unreclaimable as a name) describes the detail. [akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()] Cc: Aaditya Kumar Cc: Ying Han Cc: Nick Piggin Acked-by: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Bob Liu Cc: Neil Zhang Cc: Russell King - ARM Linux Reviewed-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Johannes Weiner Signed-off-by: KOSAKI Motohiro Signed-off-by: Lisa Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 1 + include/linux/mmzone.h | 1 - include/linux/vmstat.h | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1397ccf81e91..cf55945c83fb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #define LINUX_MM_INLINE_H #include +#include /** * page_is_file_cache - should the page be on a file LRU or anon LRU? diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ac1ea796ec0f..bd791e452ad7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -353,7 +353,6 @@ struct zone { * free areas of different sizes */ spinlock_t lock; - int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 502767f4e4d4..e4b948080d20 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -143,7 +143,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } extern unsigned long global_reclaimable_pages(void); -extern unsigned long zone_reclaimable_pages(struct zone *zone); #ifdef CONFIG_NUMA /* -- cgit v1.2.3 From 7d9f073b8da45a894bb7148433bd84d21eed6757 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:40 -0700 Subject: mm/writeback: make writeback_inodes_wb static It's not used globally and could be static. Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4e198ca1f685..021b8a319b9e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -98,8 +98,6 @@ int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); -- cgit v1.2.3 From 5a53748568f79641eaf40e41081a2f4987f005c2 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Wed, 11 Sep 2013 14:22:46 -0700 Subject: mm/page-writeback.c: add strictlimit feature The feature prevents mistrusted filesystems (ie: FUSE mounts created by unprivileged users) to grow a large number of dirty pages before throttling. For such filesystems balance_dirty_pages always check bdi counters against bdi limits. I.e. even if global "nr_dirty" is under "freerun", it's not allowed to skip bdi checks. The only use case for now is fuse: it sets bdi max_ratio to 1% by default and system administrators are supposed to expect that this limit won't be exceeded. The feature is on if a BDI is marked by BDI_CAP_STRICTLIMIT flag. A filesystem may set the flag when it initializes its BDI. The problematic scenario comes from the fact that nobody pays attention to the NR_WRITEBACK_TEMP counter (i.e. number of pages under fuse writeback). The implementation of fuse writeback releases original page (by calling end_page_writeback) almost immediately. A fuse request queued for real processing bears a copy of original page. Hence, if userspace fuse daemon doesn't finalize write requests in timely manner, an aggressive mmap writer can pollute virtually all memory by those temporary fuse page copies. They are carefully accounted in NR_WRITEBACK_TEMP, but nobody cares. To make further explanations shorter, let me use "NR_WRITEBACK_TEMP problem" as a shortcut for "a possibility of uncontrolled grow of amount of RAM consumed by temporary pages allocated by kernel fuse to process writeback". The problem was very easy to reproduce. There is a trivial example filesystem implementation in fuse userspace distribution: fusexmp_fh.c. I added "sleep(1);" to the write methods, then recompiled and mounted it. Then created a huge file on the mount point and run a simple program which mmap-ed the file to a memory region, then wrote a data to the region. An hour later I observed almost all RAM consumed by fuse writeback. Since then some unrelated changes in kernel fuse made it more difficult to reproduce, but it is still possible now. Putting this theoretical happens-in-the-lab thing aside, there is another thing that really hurts real world (FUSE) users. This is write-through page cache policy FUSE currently uses. I.e. handling write(2), kernel fuse populates page cache and flushes user data to the server synchronously. This is excessively suboptimal. Pavel Emelyanov's patches ("writeback cache policy") solve the problem, but they also make resolving NR_WRITEBACK_TEMP problem absolutely necessary. Otherwise, simply copying a huge file to a fuse mount would result in memory starvation. Miklos, the maintainer of FUSE, believes strictlimit feature the way to go. And eventually putting FUSE topics aside, there is one more use-case for strictlimit feature. Using a slow USB stick (mass storage) in a machine with huge amount of RAM installed is a well-known pain. Let's make simple computations. Assuming 64GB of RAM installed, existing implementation of balance_dirty_pages will start throttling only after 9.6GB of RAM becomes dirty (freerun == 15% of total RAM). So, the command "cp 9GB_file /media/my-usb-storage/" may return in a few seconds, but subsequent "umount /media/my-usb-storage/" will take more than two hours if effective throughput of the storage is, to say, 1MB/sec. After inclusion of strictlimit feature, it will be trivial to add a knob (e.g. /sys/devices/virtual/bdi/x:y/strictlimit) to enable it on demand. Manually or via udev rule. May be I'm wrong, but it seems to be quite a natural desire to limit the amount of dirty memory for some devices we are not fully trust (in the sense of sustainable throughput). [akpm@linux-foundation.org: fix warning in page-writeback.c] Signed-off-by: Maxim Patlasov Cc: Jan Kara Cc: Miklos Szeredi Cc: Wu Fengguang Cc: Pavel Emelyanov Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3881553f7d1..5f66d519a726 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_EXEC_MAP: Can be mapped for execution * * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. + * + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 +#define BDI_CAP_STRICTLIMIT 0x00000400 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) -- cgit v1.2.3 From f9121153fdfbfaa930bf65077a5597e20d3ac608 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:52 -0700 Subject: mm/hwpoison: don't need to hold compound lock for hugetlbfs page compound lock is introduced by commit e9da73d67("thp: compound_lock."), it is used to serialize put_page against __split_huge_page_refcount(). In addition, transparent hugepages will be splitted in hwpoison handler and just one subpage will be poisoned. There is unnecessary to hold compound lock for hugetlbfs page. This patch replace compound_trans_order by compond_order in the place where the page is hugetlbfs page. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03f84b8d7359..caf543c7eaa7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -495,20 +495,6 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } -static inline int compound_trans_order(struct page *page) -{ - int order; - unsigned long flags; - - if (!PageHead(page)) - return 0; - - flags = compound_lock_irqsave(page); - order = compound_order(page); - compound_unlock_irqrestore(page, flags); - return order; -} - static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; -- cgit v1.2.3 From fa688207c9db48b64ab6538abc3fcdf26110b9ec Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:24 -0700 Subject: smp: quit unconditionally enabling irq in on_each_cpu_mask and on_each_cpu_cond As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. There are currently no known problematical callers of these functions, but since it is a known failure pattern, we preemptively fix them. Since they are not trivial functions, make them non-inline by moving them to up.c. This also makes it so we don't have to fix #include dependancies for preempt_{disable,enable}. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 62 ++++++++++++++--------------------------------------- 1 file changed, 16 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index c8488763277f..3724a9070907 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -29,6 +29,22 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + #ifdef CONFIG_SMP #include @@ -100,22 +116,6 @@ static inline void call_function_init(void) { } */ int on_each_cpu(smp_call_func_t func, void *info, int wait); -/* - * Call a function on processors specified by mask, which might include - * the local one. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait); - -/* - * Call a function on each processor for which the supplied function - * cond_func returns a positive value. This may include the local - * processor. - */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -151,36 +151,6 @@ static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) return 0; } -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. - */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { \ - if (cpumask_test_cpu(0, (mask))) { \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - } \ - } while (0) -/* - * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. - */ -#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ - do { \ - void *__info = (info); \ - preempt_disable(); \ - if ((cond_func)(0, __info)) { \ - local_irq_disable(); \ - (func)(__info); \ - local_irq_enable(); \ - } \ - preempt_enable(); \ - } while (0) - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ -- cgit v1.2.3 From bff2dc42bcafdd75c0296987747f782965d691a0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:26 -0700 Subject: smp.h: move !SMP version of on_each_cpu() out-of-line All of the other non-trivial !SMP versions of functions in smp.h are out-of-line in up.c. Move on_each_cpu() there as well. This allows us to get rid of the #include . The drawback is that this makes both the x86_64 and i386 defconfig !SMP kernels about 200 bytes larger each. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 3724a9070907..cfb7ca094b38 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,7 +11,6 @@ #include #include #include -#include extern void cpu_idle(void); @@ -29,6 +28,11 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on all processors + */ +int on_each_cpu(smp_call_func_t func, void *info, int wait); + /* * Call a function on processors specified by mask, which might include * the local one. @@ -111,11 +115,6 @@ void generic_smp_call_function_single_interrupt(void); static inline void call_function_init(void) { } #endif -/* - * Call a function on all processors - */ -int on_each_cpu(smp_call_func_t func, void *info, int wait); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -141,16 +140,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - local_irq_save(flags); - func(info); - local_irq_restore(flags); - return 0; -} - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ -- cgit v1.2.3 From f9597f24c089dcbddbd2d9e99fbf00df57fb70c6 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Wed, 11 Sep 2013 14:23:28 -0700 Subject: syscalls.h: add forward declarations for inplace syscall wrappers Unclutter -Wmissing-prototypes warning types (enabled at make W=1) linux/include/linux/syscalls.h:190:18: warning: no previous prototype for 'SyS_semctl' [-Wmissing-prototypes] asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ ^ linux/include/linux/syscalls.h:183:2: note: in expansion of macro '__SYSCALL_DEFINEx' __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) ^ by adding forward declarations right before definitions. Signed-off-by: Sergei Trofimovich Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 1 + include/linux/syscalls.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index ec1aee4aec9c..345da00a86e0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -43,6 +43,7 @@ #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 84662ecc7b51..7fac04e7ff6e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -186,6 +186,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ -- cgit v1.2.3 From c802d64a356b5cf349121ac4c5e005f037ce548d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:11 -0700 Subject: kprobes: unify insn caches The current kpropes insn caches allocate memory areas for insn slots with module_alloc(). The assumption is that the kernel image and module area are both within the same +/- 2GB memory area. This however is not true for s390 where the kernel image resides within the first 2GB (DMA memory area), but the module area is far away in the vmalloc area, usually somewhere close below the 4TB area. For new pc relative instructions s390 needs insn slots that are within +/- 2GB of each area. That way we can patch displacements of pc-relative instructions within the insn slots just like x86 and powerpc. The module area works already with the normal insn slot allocator, however there is currently no way to get insn slots that are within the first 2GB on s390 (aka DMA area). Therefore this patch set modifies the kprobes insn slot cache code in order to allow to specify a custom allocator for the insn slot cache pages. In addition architecure can now have private insn slot caches withhout the need to modify common code. Patch 1 unifies and simplifies the current insn and optinsn caches implementation. This is a preparation which allows to add more insn caches in a simple way. Patch 2 adds the possibility to specify a custom allocator. Patch 3 makes s390 use the new insn slot mechanisms and adds support for pc-relative instructions with long displacements. This patch (of 3): The two insn caches (insn, and optinsn) each have an own mutex and alloc/free functions (get_[opt]insn_slot() / free_[opt]insn_slot()). Since there is the need for yet another insn cache which satifies dma allocations on s390, unify and simplify the current implementation: - Move the per insn cache mutex into struct kprobe_insn_cache. - Move the alloc/free functions to kprobe.h so they are simply wrappers for the generic __get_insn_slot/__free_insn_slot functions. The implementation is done with a DEFINE_INSN_CACHE_OPS() macro which provides the alloc/free functions for each cache if needed. - move the struct kprobe_insn_cache to kprobe.h which allows to generate architecture specific insn slot caches outside of the core kprobes code. Signed-off-by: Heiko Carstens Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ca1d27a0d6a6..077f65321b5e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -264,10 +264,34 @@ extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); -extern kprobe_opcode_t *get_insn_slot(void); -extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); +struct kprobe_insn_cache { + struct mutex mutex; + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; +}; + +extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); +extern void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty); + +#define DEFINE_INSN_CACHE_OPS(__name) \ +extern struct kprobe_insn_cache kprobe_##__name##_slots; \ + \ +static inline kprobe_opcode_t *get_##__name##_slot(void) \ +{ \ + return __get_insn_slot(&kprobe_##__name##_slots); \ +} \ + \ +static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ +{ \ + __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ +} \ + +DEFINE_INSN_CACHE_OPS(insn); + #ifdef CONFIG_OPTPROBES /* * Internal structure for direct jump optimized probe @@ -287,13 +311,13 @@ extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); -extern kprobe_opcode_t *get_optinsn_slot(void); -extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); +DEFINE_INSN_CACHE_OPS(optinsn); + #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, -- cgit v1.2.3 From af96397de8600232effbff43dc8b4ca20ddc02b1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:13 -0700 Subject: kprobes: allow to specify custom allocator for insn caches The current two insn slot caches both use module_alloc/module_free to allocate and free insn slot cache pages. For s390 this is not sufficient since there is the need to allocate insn slots that are either within the vmalloc module area or within dma memory. Therefore add a mechanism which allows to specify an own allocator for an own insn slot cache. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 077f65321b5e..925eaf28fca9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -268,6 +268,8 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p); struct kprobe_insn_cache { struct mutex mutex; + void *(*alloc)(void); /* allocate insn page */ + void (*free)(void *); /* free insn page */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; -- cgit v1.2.3 From 131b2f9f1214f338f0bf7c0d9760019f2b1d0c20 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:39 -0700 Subject: exec: kill "int depth" in search_binary_handler() Nobody except search_binary_handler() should touch ->recursion_depth, "int depth" buys nothing but complicates the code, kill it. Probably we should also kill "fn" and the !NULL check, ->load_binary should be always defined. And it can not go away after read_unlock() or this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 70cf138690e9..e8112ae50531 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -31,7 +31,7 @@ struct linux_binprm { #ifdef __alpha__ unsigned int taso:1; #endif - unsigned int recursion_depth; + unsigned int recursion_depth; /* only for search_binary_handler() */ struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ -- cgit v1.2.3 From be8a8d069e508d4408125e2b1471f549e7813d25 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:49 -0700 Subject: vmcore: introduce ELF header in new memory feature For s390 we want to use /proc/vmcore for our SCSI stand-alone dump (zfcpdump). We have support where the first HSA_SIZE bytes are saved into a hypervisor owned memory area (HSA) before the kdump kernel is booted. When the kdump kernel starts, it is restricted to use only HSA_SIZE bytes. The advantages of this mechanism are: * No crashkernel memory has to be defined in the old kernel. * Early boot problems (before kexec_load has been done) can be dumped * Non-Linux systems can be dumped. We modify the s390 copy_oldmem_page() function to read from the HSA memory if memory below HSA_SIZE bytes is requested. Since we cannot use the kexec tool to load the kernel in this scenario, we have to build the ELF header in the 2nd (kdump/new) kernel. So with the following patch set we would like to introduce the new function that the ELF header for /proc/vmcore can be created in the 2nd kernel memory. The following steps are done during zfcpdump execution: 1. Production system crashes 2. User boots a SCSI disk that has been prepared with the zfcpdump tool 3. Hypervisor saves CPU state of boot CPU and HSA_SIZE bytes of memory into HSA 4. Boot loader loads kernel into low memory area 5. Kernel boots and uses only HSA_SIZE bytes of memory 6. Kernel saves registers of non-boot CPUs 7. Kernel does memory detection for dump memory map 8. Kernel creates ELF header for /proc/vmcore 9. /proc/vmcore uses this header for initialization 10. The zfcpdump user space reads /proc/vmcore to write dump to SCSI disk - copy_oldmem_page() copies from HSA for memory below HSA_SIZE - copy_oldmem_page() copies from real memory for memory above HSA_SIZE Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that for the /proc/vmcore code it seems to be in the 1st kernel (old) memory and the read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. This patch: Exchange the old mechanism with the new and much cleaner function call override feature that now offcially allows to create the ELF core header in the 2nd kernel. To use the new feature the following function have to be defined by the architecture backend code to read from new memory: * elfcorehdr_alloc: Allocate ELF header * elfcorehdr_free: Free the memory of the ELF header * elfcorehdr_read: Read from ELF header * elfcorehdr_read_notes: Read from ELF notes Signed-off-by: Michael Holzheu Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Jan Willeke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 37e4f8da7cdf..6571f828e313 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -12,6 +12,12 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern int __weak elfcorehdr_alloc(unsigned long long *addr, + unsigned long long *size); +extern void __weak elfcorehdr_free(unsigned long long addr); +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); + extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); -- cgit v1.2.3 From 9cb218131de1c59dca9063b2efe876f053f316af Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:51 -0700 Subject: vmcore: introduce remap_oldmem_pfn_range() For zfcpdump we can't map the HSA storage because it is only available via a read interface. Therefore, for the new vmcore mmap feature we have introduce a new mechanism to create mappings on demand. This patch introduces a new architecture function remap_oldmem_pfn_range() that should be used to create mappings with remap_pfn_range() for oldmem areas that can be directly mapped. For zfcpdump this is everything besides of the HSA memory. For the areas that are not mapped by remap_oldmem_pfn_range() a generic vmcore a new generic vmcore fault handler mmap_vmcore_fault() is called. This handler works as follows: * Get already available or new page from page cache (find_or_create_page) * Check if /proc/vmcore page is filled with data (PageUptodate) * If yes: Return that page * If no: Fill page using __vmcore_read(), set PageUptodate, and return page Signed-off-by: Michael Holzheu Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Jan Willeke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 6571f828e313..fe68a5a98583 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -17,6 +17,9 @@ extern int __weak elfcorehdr_alloc(unsigned long long *addr, extern void __weak elfcorehdr_free(unsigned long long addr); extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); -- cgit v1.2.3 From 9dee5c51516d2c3fff22633c1272c5652e68075a Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:10 -0700 Subject: rbtree: add postorder iteration functions Postorder iteration yields all of a node's children prior to yielding the node itself, and this particular implementation also avoids examining the leaf links in a node after that node has been yielded. In what I expect will be its most common usage, postorder iteration allows the deletion of every node in an rbtree without modifying the rbtree nodes (no _requirement_ that they be nulled) while avoiding referencing child nodes after they have been "deleted" (most commonly, freed). I have only updated zswap to use this functionality at this point, but numerous bits of code (most notably in the filesystem drivers) use a hand rolled postorder iteration that NULLs child links as it traverses the tree. Each of those instances could be replaced with this common implementation. 1 & 2 add rbtree postorder iteration functions. 3 adds testing of the iteration to the rbtree runtime tests 4 allows building the rbtree runtime tests as builtins 5 updates zswap. This patch: Add postorder iteration functions for rbtree. These are useful for safely freeing an entire rbtree without modifying the tree at all. Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 0022c1bb1e26..c467151e9950 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -68,6 +68,10 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +/* Postorder iteration - always visit the parent after its children */ +extern struct rb_node *rb_first_postorder(const struct rb_root *); +extern struct rb_node *rb_next_postorder(const struct rb_node *); + /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -- cgit v1.2.3 From 2b529089257705499207ce7da9d0e3ae26a844ba Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:11 -0700 Subject: rbtree: add rbtree_postorder_for_each_entry_safe() helper Because deletion (of the entire tree) is a relatively common use of the rbtree_postorder iteration, and because doing it safely means fiddling with temporary storage, provide a helper to simplify postorder rbtree iteration. Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index c467151e9950..aa870a4ddf54 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -85,4 +85,22 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, *rb_link = node; } +/** + * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of + * given type safe against removal of rb_node entry + * + * @pos: the 'type *' to use as a loop cursor. + * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry(rb_first_postorder(root), typeof(*pos), field),\ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field); \ + &pos->field; \ + pos = n, \ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field)) + #endif /* _LINUX_RBTREE_H */ -- cgit v1.2.3 From 5e4c0d974139a98741b829b27cf38dc8f9284490 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Sep 2013 14:26:05 -0700 Subject: lib/radix-tree.c: make radix_tree_node_alloc() work correctly within interrupt With users of radix_tree_preload() run from interrupt (block/blk-ioc.c is one such possible user), the following race can happen: radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; ... radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; And we give out one radix tree node twice. That clearly results in radix tree corruption with different results (usually OOPS) depending on which two users of radix tree race. We fix the problem by making radix_tree_node_alloc() always allocate fresh radix tree nodes when in interrupt. Using preloading when in interrupt doesn't make sense since all the allocations have to be atomic anyway and we cannot steal nodes from process-context users because some users rely on radix_tree_insert() succeeding after radix_tree_preload(). in_interrupt() check is somewhat ugly but we cannot simply key off passed gfp_mask as that is acquired from root_gfp_mask() and thus the same for all preload users. Another part of the fix is to avoid node preallocation in radix_tree_preload() when passed gfp_mask doesn't allow waiting. Again, preallocation in such case doesn't make sense and when preallocation would happen in interrupt we could possibly leak some allocated nodes. However, some users of radix_tree_preload() require following radix_tree_insert() to succeed. To avoid unexpected effects for these users, radix_tree_preload() only warns if passed gfp mask doesn't allow waiting and we provide a new function radix_tree_maybe_preload() for those users which get different gfp mask from different call sites and which are prepared to handle radix_tree_insert() failure. Signed-off-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c38b0a..403940787be1 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); -- cgit v1.2.3 From 57f150a58c40cda598c31af8bceb8598f43c3e5f Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:10 -0700 Subject: initmpfs: move rootfs code from fs/ramfs/ to init/ When the rootfs code was a wrapper around ramfs, having them in the same file made sense. Now that it can wrap another filesystem type, move it in with the init code instead. This also allows a subsequent patch to access rootfstype= command line arg. Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 1 + include/linux/ramfs.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index e73f2b708525..f1c27a71d03c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -153,6 +153,7 @@ extern unsigned int reset_devices; void setup_arch(char **); void prepare_namespace(void); void __init load_default_modules(void); +int __init init_rootfs(void); extern void (*late_time_init)(void); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 69e37c2d1ea5..753207c8ce20 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -25,7 +25,7 @@ extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations ramfs_file_operations; extern const struct vm_operations_struct generic_file_vm_ops; -extern int __init init_rootfs(void); +extern int __init init_ramfs_fs(void); int ramfs_fill_super(struct super_block *sb, void *data, int silent); -- cgit v1.2.3 From d9a605e40b1376eb02b067d7690580255a0df68f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:24 -0700 Subject: ipc: rename ids->rw_mutex Since in some situations the lock can be shared for readers, we shouldn't be calling it a mutex, rename it to rwsem. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c4d870b0d5e6..19c19a5eee29 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -22,7 +22,7 @@ struct ipc_ids { int in_use; unsigned short seq; unsigned short seq_max; - struct rw_semaphore rw_mutex; + struct rw_semaphore rwsem; struct idr ipcs_idr; int next_id; }; -- cgit v1.2.3 From b34081f1cd59585451efaa69e1dff1b9507e6c89 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Sep 2013 14:26:32 -0700 Subject: lz4: fix compression/decompression signedness mismatch LZ4 compression and decompression functions require different in signedness input/output parameters: unsigned char for compression and signed char for decompression. Change decompression API to require "(const) unsigned char *". Signed-off-by: Sergey Senozhatsky Cc: Kyungsik Lee Cc: Geert Uytterhoeven Cc: Yann Collet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index d21c13f10a64..4356686b0a39 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -67,8 +67,8 @@ int lz4hc_compress(const unsigned char *src, size_t src_len, * note : Destination buffer must be already allocated. * slightly faster than lz4_decompress_unknownoutputsize() */ -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len); +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); /* * lz4_decompress_unknownoutputsize() @@ -82,6 +82,6 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest, * Error if return (< 0) * note : Destination buffer must be already allocated. */ -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len); +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); #endif -- cgit v1.2.3