summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig45
-rw-r--r--mm/Makefile10
-rw-r--r--mm/allocpercpu.c177
-rw-r--r--mm/backing-dev.c27
-rw-r--r--mm/bootmem.c32
-rw-r--r--mm/filemap.c59
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/highmem.c17
-rw-r--r--mm/hugetlb.c563
-rw-r--r--mm/hwpoison-inject.c41
-rw-r--r--mm/internal.h23
-rw-r--r--mm/kmemleak.c9
-rw-r--r--mm/ksm.c960
-rw-r--r--mm/madvise.c30
-rw-r--r--mm/memcontrol.c1090
-rw-r--r--mm/memory-failure.c833
-rw-r--r--mm/memory.c127
-rw-r--r--mm/memory_hotplug.c40
-rw-r--r--mm/mempolicy.c82
-rw-r--r--mm/migrate.c137
-rw-r--r--mm/mincore.c37
-rw-r--r--mm/mlock.c45
-rw-r--r--mm/mmap.c98
-rw-r--r--mm/mremap.c245
-rw-r--r--mm/nommu.c98
-rw-r--r--mm/oom_kill.c103
-rw-r--r--mm/page-writeback.c72
-rw-r--r--mm/page_alloc.c77
-rw-r--r--mm/page_io.c17
-rw-r--r--mm/pagewalk.c32
-rw-r--r--mm/percpu.c241
-rw-r--r--mm/quicklist.c3
-rw-r--r--mm/rmap.c402
-rw-r--r--mm/shmem.c25
-rw-r--r--mm/slab.c148
-rw-r--r--mm/slub.c24
-rw-r--r--mm/swapfile.c866
-rw-r--r--mm/truncate.c144
-rw-r--r--mm/util.c44
-rw-r--r--mm/vmalloc.c59
-rw-r--r--mm/vmscan.c392
-rw-r--r--mm/vmstat.c10
42 files changed, 5225 insertions, 2261 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 71eb0b4cce8d..2310984591ed 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
config SPARSEMEM
def_bool y
- depends on SPARSEMEM_MANUAL
+ depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
config FLATMEM
def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
- depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC64 || SUPERH || S390)
-
-comment "Memory hotplug is currently incompatible with Software Suspend"
- depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
+ depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -161,11 +158,13 @@ config PAGEFLAGS_EXTENDED
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
config SPLIT_PTLOCK_CPUS
int
- default "4096" if ARM && !CPU_CACHE_VIPT
- default "4096" if PARISC && !PA20
+ default "999999" if ARM && !CPU_CACHE_VIPT
+ default "999999" if PARISC && !PA20
+ default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
default "4"
#
@@ -203,14 +202,6 @@ config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
-config HAVE_MLOCK
- bool
- default y if MMU=y
-
-config HAVE_MLOCKED_PAGE_BIT
- bool
- default y if HAVE_MLOCK=y
-
config MMU_NOTIFIER
bool
@@ -221,10 +212,12 @@ config KSM
Enable Kernel Samepage Merging: KSM periodically scans those areas
of an application's address space that an app has advised may be
mergeable. When it finds pages of identical content, it replaces
- the many instances by a single resident page with that content, so
+ the many instances by a single page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
- See Documentation/vm/ksm.txt for more information.
+ See Documentation/vm/ksm.txt for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
@@ -244,6 +237,22 @@ config DEFAULT_MMAP_MIN_ADDR
This value can be changed after boot using the
/proc/sys/vm/mmap_min_addr tunable.
+config ARCH_SUPPORTS_MEMORY_FAILURE
+ bool
+
+config MEMORY_FAILURE
+ depends on MMU
+ depends on ARCH_SUPPORTS_MEMORY_FAILURE
+ bool "Enable recovery from hardware memory errors"
+ help
+ Enables code to recover from some memory failures on systems
+ with MCA recovery. This allows a system to continue running
+ even when some of its memory has uncorrected errors. This requires
+ special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+ tristate "Poison pages injector"
+ depends on MEMORY_FAILURE && DEBUG_KERNEL
config NOMMU_INITIAL_TRIM_EXCESS
int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Makefile b/mm/Makefile
index 88193d73cd1a..82131d0f8d85 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,14 +5,14 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o pagewalk.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o mmu_context.o \
- pagewalk.o $(mmu-y)
+ $(mmu-y)
obj-y += init-mm.o
obj-$(CONFIG_BOUNCE) += bounce.o
@@ -34,12 +34,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
-else
-obj-$(CONFIG_SMP) += allocpercpu.o
-endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index df34ceae0c67..000000000000
--- a/mm/allocpercpu.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * linux/mm/allocpercpu.c
- *
- * Separated from slab.c August 11, 2006 Christoph Lameter
- */
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/bootmem.h>
-#include <asm/sections.h>
-
-#ifndef cache_line_size
-#define cache_line_size() L1_CACHE_BYTES
-#endif
-
-/**
- * percpu_depopulate - depopulate per-cpu data for given cpu
- * @__pdata: per-cpu data to depopulate
- * @cpu: depopulate per-cpu data for this cpu
- *
- * Depopulating per-cpu data for a cpu going offline would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- */
-static void percpu_depopulate(void *__pdata, int cpu)
-{
- struct percpu_data *pdata = __percpu_disguise(__pdata);
-
- kfree(pdata->ptrs[cpu]);
- pdata->ptrs[cpu] = NULL;
-}
-
-/**
- * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
- * @__pdata: per-cpu data to depopulate
- * @mask: depopulate per-cpu data for cpu's selected through mask bits
- */
-static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
-{
- int cpu;
- for_each_cpu_mask_nr(cpu, *mask)
- percpu_depopulate(__pdata, cpu);
-}
-
-#define percpu_depopulate_mask(__pdata, mask) \
- __percpu_depopulate_mask((__pdata), &(mask))
-
-/**
- * percpu_populate - populate per-cpu data for given cpu
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @cpu: populate per-data for this cpu
- *
- * Populating per-cpu data for a cpu coming online would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- * Per-cpu object is populated with zeroed buffer.
- */
-static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
-{
- struct percpu_data *pdata = __percpu_disguise(__pdata);
- int node = cpu_to_node(cpu);
-
- /*
- * We should make sure each CPU gets private memory.
- */
- size = roundup(size, cache_line_size());
-
- BUG_ON(pdata->ptrs[cpu]);
- if (node_online(node))
- pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
- else
- pdata->ptrs[cpu] = kzalloc(size, gfp);
- return pdata->ptrs[cpu];
-}
-
-/**
- * percpu_populate_mask - populate per-cpu data for more cpu's
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-cpu data for cpu's selected through mask bits
- *
- * Per-cpu objects are populated with zeroed buffers.
- */
-static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
- cpumask_t *mask)
-{
- cpumask_t populated;
- int cpu;
-
- cpus_clear(populated);
- for_each_cpu_mask_nr(cpu, *mask)
- if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
- __percpu_depopulate_mask(__pdata, &populated);
- return -ENOMEM;
- } else
- cpu_set(cpu, populated);
- return 0;
-}
-
-#define percpu_populate_mask(__pdata, size, gfp, mask) \
- __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
-
-/**
- * alloc_percpu - initial setup of per-cpu data
- * @size: size of per-cpu object
- * @align: alignment
- *
- * Allocate dynamic percpu area. Percpu objects are populated with
- * zeroed buffers.
- */
-void *__alloc_percpu(size_t size, size_t align)
-{
- /*
- * We allocate whole cache lines to avoid false sharing
- */
- size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
- void *pdata = kzalloc(sz, GFP_KERNEL);
- void *__pdata = __percpu_disguise(pdata);
-
- /*
- * Can't easily make larger alignment work with kmalloc. WARN
- * on it. Larger alignment should only be used for module
- * percpu sections on SMP for which this path isn't used.
- */
- WARN_ON_ONCE(align > SMP_CACHE_BYTES);
-
- if (unlikely(!pdata))
- return NULL;
- if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
- &cpu_possible_map)))
- return __pdata;
- kfree(pdata);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-
-/**
- * free_percpu - final cleanup of per-cpu data
- * @__pdata: object to clean up
- *
- * We simply clean up any per-cpu object left. No need for the client to
- * track and specify through a bis mask which per-cpu objects are to free.
- */
-void free_percpu(void *__pdata)
-{
- if (unlikely(!__pdata))
- return;
- __percpu_depopulate_mask(__pdata, cpu_possible_mask);
- kfree(__percpu_disguise(__pdata));
-}
-EXPORT_SYMBOL_GPL(free_percpu);
-
-/*
- * Generic percpu area setup.
- */
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-
-EXPORT_SYMBOL(__per_cpu_offset);
-
-void __init setup_per_cpu_areas(void)
-{
- unsigned long size, i;
- char *ptr;
- unsigned long nr_possible_cpus = num_possible_cpus();
-
- /* Copy section for each CPU (we discard the original) */
- size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
- ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-
- for_each_possible_cpu(i) {
- __per_cpu_offset[i] = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- ptr += size;
- }
-}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3d3accb1f800..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n"
- "WriteBack threads:%8lu\n"
+ "WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
@@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
/*
* Finally, kill the kernel threads. We don't need to be RCU
- * safe anymore, since the bdi is gone from visibility.
+ * safe anymore, since the bdi is gone from visibility. Force
+ * unfreeze of the thread before calling kthread_stop(), otherwise
+ * it would never exet if it is currently stuck in the refrigerator.
*/
- list_for_each_entry(wb, &bdi->wb_list, list)
+ list_for_each_entry(wb, &bdi->wb_list, list) {
+ thaw_process(wb->task);
kthread_stop(wb->task);
+ }
+}
+
+/*
+ * This bdi is going away now, make sure that no super_blocks point to it
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+ struct super_block *sb;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdi == bdi)
+ sb->s_bdi = NULL;
+ }
+ spin_unlock(&sb_lock);
}
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_prune_sb(bdi);
+
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..7d1486875e1c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+ unsigned long cursor, end;
+
+ kmemleak_free_part(__va(addr), size);
+
+ cursor = PFN_UP(addr);
+ end = PFN_DOWN(addr + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
+
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
@@ -408,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags);
}
-static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
- unsigned long step)
+static unsigned long __init align_idx(struct bootmem_data *bdata,
+ unsigned long idx, unsigned long step)
{
unsigned long base = bdata->node_min_pfn;
@@ -421,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
return ALIGN(base + idx, step) - base;
}
-static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
- unsigned long align)
+static unsigned long __init align_off(struct bootmem_data *bdata,
+ unsigned long off, unsigned long align)
{
unsigned long base = PFN_PHYS(bdata->node_min_pfn);
diff --git a/mm/filemap.c b/mm/filemap.c
index bcc7372aebbc..8b4d88f9249e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -58,7 +58,7 @@
/*
* Lock ordering:
*
- * ->i_mmap_lock (vmtruncate)
+ * ->i_mmap_lock (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
@@ -104,6 +104,10 @@
*
* ->task->proc_lock
* ->dcache_lock (proc_pid_lookup)
+ *
+ * (code doesn't rely on that order, so you could switch it around)
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
+ * ->i_mmap_lock
*/
/*
@@ -256,27 +260,27 @@ int filemap_flush(struct address_space *mapping)
EXPORT_SYMBOL(filemap_flush);
/**
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping: target address_space
- * @start: beginning page index
- * @end: ending page index
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
*
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
*/
-int wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+ loff_t end_byte)
{
+ pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
struct pagevec pvec;
int nr_pages;
int ret = 0;
- pgoff_t index;
- if (end < start)
+ if (end_byte < start_byte)
return 0;
pagevec_init(&pvec, 0);
- index = start;
while ((index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_WRITEBACK,
@@ -306,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping,
return ret;
}
-
-/**
- * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
- * @mapping: address space structure to wait for
- * @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- *
- * This is just a simple wrapper so that callers don't have to convert offsets
- * to page indexes themselves
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
- loff_t end)
-{
- return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
-}
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
@@ -341,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping)
if (i_size == 0)
return 0;
- return wait_on_page_writeback_range(mapping, 0,
- (i_size - 1) >> PAGE_CACHE_SHIFT);
+ return filemap_fdatawait_range(mapping, 0, i_size - 1);
}
EXPORT_SYMBOL(filemap_fdatawait);
@@ -389,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
if (err != -EIO) {
- int err2 = wait_on_page_writeback_range(mapping,
- lstart >> PAGE_CACHE_SHIFT,
- lend >> PAGE_CACHE_SHIFT);
+ int err2 = filemap_fdatawait_range(mapping,
+ lstart, lend);
if (!err)
err = err2;
}
@@ -1607,7 +1590,7 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-struct vm_operations_struct generic_file_vm_ops = {
+const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
};
@@ -1840,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
/*
* Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied. If a fault is encountered then return the number of
+ * were successfully copied. If a fault is encountered then return the number of
* bytes which were copied.
*/
size_t iov_iter_copy_from_user_atomic(struct page *page,
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 427dfe3ce78c..1888b2d71bb8 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -296,7 +296,7 @@ out:
}
}
-static struct vm_operations_struct xip_file_vm_ops = {
+static const struct vm_operations_struct xip_file_vm_ops = {
.fault = xip_file_fault,
};
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
void debug_kmap_atomic(enum km_type type)
{
- static unsigned warn_count = 10;
+ static int warn_count = 10;
- if (unlikely(warn_count == 0))
+ if (unlikely(warn_count < 0))
return;
if (unlikely(in_interrupt())) {
- if (in_irq()) {
+ if (in_nmi()) {
+ if (type != KM_NMI && type != KM_NMI_PTE) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (in_irq()) {
if (type != KM_IRQ0 && type != KM_IRQ1 &&
type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ) {
+ type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
WARN_ON(1);
warn_count--;
}
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
}
if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
+ type == KM_IRQ_PTE || type == KM_NMI ||
+ type == KM_NMI_PTE ) {
if (!irqs_disabled()) {
WARN_ON(1);
warn_count--;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 815dbd4a6dcb..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@
#include <asm/io.h>
#include <linux/hugetlb.h>
+#include <linux/node.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
}
/*
- * Use a helper variable to find the next node and then
- * copy it back to next_nid_to_alloc afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do. Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
*/
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
- int next_nid;
- next_nid = next_node(h->next_nid_to_alloc, node_online_map);
- if (next_nid == MAX_NUMNODES)
- next_nid = first_node(node_online_map);
- h->next_nid_to_alloc = next_nid;
- return next_nid;
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
}
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
int start_nid;
int next_nid;
int ret = 0;
- start_nid = h->next_nid_to_alloc;
+ start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
next_nid = start_nid;
do {
page = alloc_fresh_huge_page_node(h, next_nid);
- if (page)
+ if (page) {
ret = 1;
- next_nid = hstate_next_node_to_alloc(h);
- } while (!page && next_nid != start_nid);
+ break;
+ }
+ next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ } while (next_nid != start_nid);
if (ret)
count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
}
/*
- * helper for free_pool_huge_page() - find next node
- * from which to free a huge page
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
*/
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
- int next_nid;
- next_nid = next_node(h->next_nid_to_free, node_online_map);
- if (next_nid == MAX_NUMNODES)
- next_nid = first_node(node_online_map);
- h->next_nid_to_free = next_nid;
- return next_nid;
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
}
/*
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h)
* balanced over allowed nodes.
* Called with hugetlb_lock locked.
*/
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ bool acct_surplus)
{
int start_nid;
int next_nid;
int ret = 0;
- start_nid = h->next_nid_to_free;
+ start_nid = hstate_next_node_to_free(h, nodes_allowed);
next_nid = start_nid;
do {
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
}
update_and_free_page(h, page);
ret = 1;
+ break;
}
- next_nid = hstate_next_node_to_free(h);
- } while (!ret && next_nid != start_nid);
+ next_nid = hstate_next_node_to_free(h, nodes_allowed);
+ } while (next_nid != start_nid);
return ret;
}
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h,
/*
* We want to release as many surplus pages as possible, spread
- * evenly across all nodes. Iterate across all nodes until we
- * can no longer free unreserved surplus pages. This occurs when
- * the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the the frees across the
- * on-line nodes for us and will handle the hstate accounting.
+ * evenly across all nodes with memory. Iterate across these nodes
+ * until we can no longer free unreserved surplus pages. This occurs
+ * when the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the the freed pages across the
+ * on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
- if (!free_pool_huge_page(h, 1))
+ if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
break;
}
}
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes = nodes_weight(node_online_map);
+ int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
while (nr_nodes) {
void *addr;
addr = __alloc_bootmem_node_nopanic(
- NODE_DATA(h->next_nid_to_alloc),
+ NODE_DATA(hstate_next_node_to_alloc(h,
+ &node_states[N_HIGH_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
- hstate_next_node_to_alloc(h);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (h->order >= MAX_ORDER) {
if (!alloc_bootmem_huge_page(h))
break;
- } else if (!alloc_fresh_huge_page(h))
+ } else if (!alloc_fresh_huge_page(h,
+ &node_states[N_HIGH_MEMORY]))
break;
}
h->max_huge_pages = i;
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void)
}
#ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
{
int i;
if (h->order >= MAX_ORDER)
return;
- for (i = 0; i < MAX_NUMNODES; ++i) {
+ for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
}
}
#else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
{
}
#endif
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*/
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+ int delta)
{
int start_nid, next_nid;
int ret = 0;
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0)
- start_nid = h->next_nid_to_alloc;
+ start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
else
- start_nid = h->next_nid_to_free;
+ start_nid = hstate_next_node_to_free(h, nodes_allowed);
next_nid = start_nid;
do {
int nid = next_nid;
if (delta < 0) {
- next_nid = hstate_next_node_to_alloc(h);
/*
* To shrink on this node, there must be a surplus page
*/
- if (!h->surplus_huge_pages_node[nid])
+ if (!h->surplus_huge_pages_node[nid]) {
+ next_nid = hstate_next_node_to_alloc(h,
+ nodes_allowed);
continue;
+ }
}
if (delta > 0) {
- next_nid = hstate_next_node_to_free(h);
/*
* Surplus cannot exceed the total number of pages
*/
if (h->surplus_huge_pages_node[nid] >=
- h->nr_huge_pages_node[nid])
+ h->nr_huge_pages_node[nid]) {
+ next_nid = hstate_next_node_to_free(h,
+ nodes_allowed);
continue;
+ }
}
h->surplus_huge_pages += delta;
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
*/
spin_lock(&hugetlb_lock);
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
- if (!adjust_pool_surplus(h, -1))
+ if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
}
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
- ret = alloc_fresh_huge_page(h);
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
+ /* Bail for signals. Probably ctrl-c from user */
+ if (signal_pending(current))
+ goto out;
}
/*
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
*/
min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
- try_to_free_low(h, min_count);
+ try_to_free_low(h, min_count, nodes_allowed);
while (min_count < persistent_huge_pages(h)) {
- if (!free_pool_huge_page(h, 0))
+ if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
}
while (count < persistent_huge_pages(h)) {
- if (!adjust_pool_surplus(h, 1))
+ if (!adjust_pool_surplus(h, nodes_allowed, 1))
break;
}
out:
@@ -1282,43 +1325,117 @@ out:
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
-static struct hstate *kobj_to_hstate(struct kobject *kobj)
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
int i;
+
for (i = 0; i < HUGE_MAX_HSTATE; i++)
- if (hstate_kobjs[i] == kobj)
+ if (hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = NUMA_NO_NODE;
return &hstates[i];
- BUG();
- return NULL;
+ }
+
+ return kobj_to_node_hstate(kobj, nidp);
}
-static ssize_t nr_hugepages_show(struct kobject *kobj,
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- struct hstate *h = kobj_to_hstate(kobj);
- return sprintf(buf, "%lu\n", h->nr_huge_pages);
+ struct hstate *h;
+ unsigned long nr_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ nr_huge_pages = h->nr_huge_pages;
+ else
+ nr_huge_pages = h->nr_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", nr_huge_pages);
}
-static ssize_t nr_hugepages_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+ struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
{
int err;
- unsigned long input;
- struct hstate *h = kobj_to_hstate(kobj);
+ int nid;
+ unsigned long count;
+ struct hstate *h;
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
- err = strict_strtoul(buf, 10, &input);
+ err = strict_strtoul(buf, 10, &count);
if (err)
return 0;
- h->max_huge_pages = set_max_huge_pages(h, input);
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE) {
+ /*
+ * global hstate attribute
+ */
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(nodes_allowed))) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_HIGH_MEMORY];
+ }
+ } else if (nodes_allowed) {
+ /*
+ * per node hstate attribute: adjust count to global,
+ * but restrict alloc/free to the specified node.
+ */
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ init_nodemask_of_node(nodes_allowed, nid);
+ } else
+ nodes_allowed = &node_states[N_HIGH_MEMORY];
+
+ h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
- return count;
+ if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+
+ return len;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(false, kobj, attr, buf, len);
}
HSTATE_ATTR(nr_hugepages);
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- struct hstate *h = kobj_to_hstate(kobj);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
{
int err;
unsigned long input;
- struct hstate *h = kobj_to_hstate(kobj);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
err = strict_strtoul(buf, 10, &input);
if (err)
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
static ssize_t free_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- struct hstate *h = kobj_to_hstate(kobj);
- return sprintf(buf, "%lu\n", h->free_huge_pages);
+ struct hstate *h;
+ unsigned long free_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ free_huge_pages = h->free_huge_pages;
+ else
+ free_huge_pages = h->free_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);
static ssize_t resv_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- struct hstate *h = kobj_to_hstate(kobj);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages);
static ssize_t surplus_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- struct hstate *h = kobj_to_hstate(kobj);
- return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+ struct hstate *h;
+ unsigned long surplus_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ surplus_huge_pages = h->surplus_huge_pages;
+ else
+ surplus_huge_pages = h->surplus_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = {
&free_hugepages_attr.attr,
&resv_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+ &nr_hugepages_mempolicy_attr.attr,
+#endif
NULL,
};
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
+ struct kobject *parent,
+ struct kobject **hstate_kobjs,
+ struct attribute_group *hstate_attr_group)
{
int retval;
+ int hi = h - hstates;
- hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
- hugepages_kobj);
- if (!hstate_kobjs[h - hstates])
+ hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
+ if (!hstate_kobjs[hi])
return -ENOMEM;
- retval = sysfs_create_group(hstate_kobjs[h - hstates],
- &hstate_attr_group);
+ retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
if (retval)
- kobject_put(hstate_kobjs[h - hstates]);
+ kobject_put(hstate_kobjs[hi]);
return retval;
}
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
return;
for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h);
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
if (err)
printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
h->name);
}
}
+#ifdef CONFIG_NUMA
+
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node sysdevs in node_devices[] using a parallel array. The array
+ * index of a node sysdev or _hstate == node id.
+ * This is here to avoid any static dependency of the node sysdev driver, in
+ * the base kernel, on the hugetlb module.
+ */
+struct node_hstate {
+ struct kobject *hugepages_kobj;
+ struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+};
+struct node_hstate node_hstates[MAX_NUMNODES];
+
+/*
+ * A subset of global hstate attributes for node sysdevs
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+static struct attribute_group per_node_hstate_attr_group = {
+ .attrs = per_node_hstate_attrs,
+};
+
+/*
+ * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * Returns node id via non-NULL nidp.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++) {
+ struct node_hstate *nhs = &node_hstates[nid];
+ int i;
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (nhs->hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = nid;
+ return &hstates[i];
+ }
+ }
+
+ BUG();
+ return NULL;
+}
+
+/*
+ * Unregister hstate attributes from a single node sysdev.
+ * No-op if no hstate attributes attached.
+ */
+void hugetlb_unregister_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+
+ if (!nhs->hugepages_kobj)
+ return; /* no hstate attributes */
+
+ for_each_hstate(h)
+ if (nhs->hstate_kobjs[h - hstates]) {
+ kobject_put(nhs->hstate_kobjs[h - hstates]);
+ nhs->hstate_kobjs[h - hstates] = NULL;
+ }
+
+ kobject_put(nhs->hugepages_kobj);
+ nhs->hugepages_kobj = NULL;
+}
+
+/*
+ * hugetlb module exit: unregister hstate attributes from node sysdevs
+ * that have them.
+ */
+static void hugetlb_unregister_all_nodes(void)
+{
+ int nid;
+
+ /*
+ * disable node sysdev registrations.
+ */
+ register_hugetlbfs_with_node(NULL, NULL);
+
+ /*
+ * remove hstate attributes from any nodes that have them.
+ */
+ for (nid = 0; nid < nr_node_ids; nid++)
+ hugetlb_unregister_node(&node_devices[nid]);
+}
+
+/*
+ * Register hstate attributes for a single node sysdev.
+ * No-op if attributes already registered.
+ */
+void hugetlb_register_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ int err;
+
+ if (nhs->hugepages_kobj)
+ return; /* already allocated */
+
+ nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+ &node->sysdev.kobj);
+ if (!nhs->hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+ nhs->hstate_kobjs,
+ &per_node_hstate_attr_group);
+ if (err) {
+ printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
+ " for node %d\n",
+ h->name, node->sysdev.id);
+ hugetlb_unregister_node(node);
+ break;
+ }
+ }
+}
+
+/*
+ * hugetlb init time: register hstate attributes for all registered node
+ * sysdevs of nodes that have memory. All on-line nodes should have
+ * registered their associated sysdev by this time.
+ */
+static void hugetlb_register_all_nodes(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ struct node *node = &node_devices[nid];
+ if (node->sysdev.id == nid)
+ hugetlb_register_node(node);
+ }
+
+ /*
+ * Let the node sysdev driver know we're here so it can
+ * [un]register hstate attributes on node hotplug.
+ */
+ register_hugetlbfs_with_node(hugetlb_register_node,
+ hugetlb_unregister_node);
+}
+#else /* !CONFIG_NUMA */
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ BUG();
+ if (nidp)
+ *nidp = -1;
+ return NULL;
+}
+
+static void hugetlb_unregister_all_nodes(void) { }
+
+static void hugetlb_register_all_nodes(void) { }
+
+#endif
+
static void __exit hugetlb_exit(void)
{
struct hstate *h;
+ hugetlb_unregister_all_nodes();
+
for_each_hstate(h) {
kobject_put(hstate_kobjs[h - hstates]);
}
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
hugetlb_sysfs_init();
+ hugetlb_register_all_nodes();
+
return 0;
}
module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
- h->next_nid_to_alloc = first_node(node_online_map);
- h->next_nid_to_free = first_node(node_online_map);
+ h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
+ h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
}
#ifdef CONFIG_SYSCTL
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
- size_t *length, loff_t *ppos)
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
@@ -1548,19 +1857,47 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
+
+ if (write) {
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed,
+ GFP_KERNEL | __GFP_NORETRY);
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(nodes_allowed))) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_HIGH_MEMORY];
+ }
+ h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
- if (write)
- h->max_huge_pages = set_max_huge_pages(h, tmp);
+ if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+ }
return 0;
}
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+
+ return hugetlb_sysctl_handler_common(false, table, write,
+ buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ return hugetlb_sysctl_handler_common(true, table, write,
+ buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
if (hugepages_treat_as_movable)
htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
else
@@ -1569,7 +1906,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
}
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
@@ -1580,7 +1917,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (write) {
spin_lock(&hugetlb_lock);
@@ -1721,7 +2058,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return 0;
}
-struct vm_operations_struct hugetlb_vm_ops = {
+const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+ (vma->vm_pgoff >> PAGE_SHIFT);
mapping = (struct address_space *)page_private(page);
+ /*
+ * Take the mapping lock for the duration of the table walk. As
+ * this mapping should be shared between all the VMAs,
+ * __unmap_hugepage_range() is called as the lock is already held
+ */
+ spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
- unmap_hugepage_range(iter_vma,
+ __unmap_hugepage_range(iter_vma,
address, address + huge_page_size(h),
page);
}
+ spin_unlock(&mapping->i_mmap_lock);
return 1;
}
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
outside_reserve = 1;
page_cache_get(old_page);
+
+ /* Drop page_table_lock as buddy allocator may be called */
+ spin_unlock(&mm->page_table_lock);
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
if (unmap_ref_private(mm, vma, old_page, address)) {
BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
+ spin_lock(&mm->page_table_lock);
goto retry_avoidcopy;
}
WARN_ON_ONCE(1);
}
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
return -PTR_ERR(new_page);
}
- spin_unlock(&mm->page_table_lock);
copy_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
- spin_lock(&mm->page_table_lock);
+ /*
+ * Retake the page_table_lock to check for racing updates
+ * before the page tables are altered
+ */
+ spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
+/* Inject a hwpoison memory failure on a arbitary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+static struct dentry *hwpoison_dir, *corrupt_pfn;
+
+static int hwpoison_inject(void *data, u64 val)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
+ return __memory_failure(val, 18, 0);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+
+static void pfn_inject_exit(void)
+{
+ if (hwpoison_dir)
+ debugfs_remove_recursive(hwpoison_dir);
+}
+
+static int pfn_inject_init(void)
+{
+ hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+ if (hwpoison_dir == NULL)
+ return -ENOMEM;
+ corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+ NULL, &hwpoison_fops);
+ if (corrupt_pfn == NULL) {
+ pfn_inject_exit();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/internal.h b/mm/internal.h
index 22ec8d2b0fb8..4fe67a162cb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,7 +63,7 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
-#ifdef CONFIG_HAVE_MLOCK
+#ifdef CONFIG_MMU
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -72,21 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
}
-#endif
-
-/*
- * unevictable_migrate_page() called only from migrate_page_copy() to
- * migrate unevictable flag to new page.
- * Note that the old page has been isolated from the LRU lists at this
- * point so we don't need to worry about LRU statistics.
- */
-static inline void unevictable_migrate_page(struct page *new, struct page *old)
-{
- if (TestClearPageUnevictable(old))
- SetPageUnevictable(new);
-}
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
/*
* Called only in fault path via page_evictable() for a new page
* to determine if it's being mapped into a LOCKED vma.
@@ -107,9 +93,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
}
/*
- * must be called with vma's mmap_sem held for read, and page locked.
+ * must be called with vma's mmap_sem held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
+extern void munlock_vma_page(struct page *page);
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
@@ -144,7 +131,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
-#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
+#else /* !CONFIG_MMU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
return 0;
@@ -153,7 +140,7 @@ static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
+#endif /* !CONFIG_MMU */
/*
* Return the mem_map entry representing the 'offset' subpage within
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 4ea4510e2996..13f33b3081ec 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log)
*/
rcu_read_lock();
object = create_object((unsigned long)log->ptr, log->size,
- log->min_count, GFP_KERNEL);
+ log->min_count, GFP_ATOMIC);
+ if (!object)
+ goto out;
spin_lock_irqsave(&object->lock, flags);
for (i = 0; i < log->trace_len; i++)
object->trace[i] = log->trace[i];
object->trace_len = log->trace_len;
spin_unlock_irqrestore(&object->lock, flags);
+out:
rcu_read_unlock();
}
@@ -1047,8 +1050,8 @@ static void scan_object(struct kmemleak_object *object)
unsigned long flags;
/*
- * Once the object->lock is aquired, the corresponding memory block
- * cannot be freed (the same lock is aquired in delete_object).
+ * Once the object->lock is acquired, the corresponding memory block
+ * cannot be freed (the same lock is acquired in delete_object).
*/
spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
diff --git a/mm/ksm.c b/mm/ksm.c
index 37cc37325094..56a0da1f9979 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,10 +29,13 @@
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
+#include <linux/memory.h>
#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
#include <linux/ksm.h>
#include <asm/tlbflush.h>
+#include "internal.h"
/*
* A few notes about the KSM scanning process,
@@ -78,13 +81,13 @@
* struct mm_slot - ksm information per mm that is being scanned
* @link: link to the mm_slots hash list
* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
- * @rmap_list: head for this mm_slot's list of rmap_items
+ * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
* @mm: the mm that this information is valid for
*/
struct mm_slot {
struct hlist_node link;
struct list_head mm_list;
- struct list_head rmap_list;
+ struct rmap_item *rmap_list;
struct mm_struct *mm;
};
@@ -92,7 +95,7 @@ struct mm_slot {
* struct ksm_scan - cursor for scanning
* @mm_slot: the current mm_slot we are scanning
* @address: the next address inside that to be scanned
- * @rmap_item: the current rmap that we are scanning inside the rmap_list
+ * @rmap_list: link to the next rmap to be scanned in the rmap_list
* @seqnr: count of completed full scans (needed when removing unstable node)
*
* There is only the one ksm_scan instance of this cursor structure.
@@ -100,37 +103,51 @@ struct mm_slot {
struct ksm_scan {
struct mm_slot *mm_slot;
unsigned long address;
- struct rmap_item *rmap_item;
+ struct rmap_item **rmap_list;
unsigned long seqnr;
};
/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page
+ */
+struct stable_node {
+ struct rb_node node;
+ struct hlist_head hlist;
+ unsigned long kpfn;
+};
+
+/**
* struct rmap_item - reverse mapping item for virtual addresses
- * @link: link into mm_slot's rmap_list (rmap_list is per mm)
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @mm: the memory structure this rmap_item is pointing into
* @address: the virtual address this rmap_item tracks (+ flags in low bits)
* @oldchecksum: previous checksum of the page at that virtual address
- * @node: rb_node of this rmap_item in either unstable or stable tree
- * @next: next rmap_item hanging off the same node of the stable tree
- * @prev: previous rmap_item hanging off the same node of the stable tree
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
*/
struct rmap_item {
- struct list_head link;
+ struct rmap_item *rmap_list;
+ struct anon_vma *anon_vma; /* when stable */
struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */
+ unsigned int oldchecksum; /* when unstable */
union {
- unsigned int oldchecksum; /* when unstable */
- struct rmap_item *next; /* when stable */
- };
- union {
- struct rb_node node; /* when tree node */
- struct rmap_item *prev; /* in stable list */
+ struct rb_node node; /* when node of unstable tree */
+ struct { /* when listed from stable tree */
+ struct stable_node *head;
+ struct hlist_node hlist;
+ };
};
};
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
-#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
-#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
+#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
+#define STABLE_FLAG 0x200 /* is listed from the stable tree */
/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
@@ -147,6 +164,7 @@ static struct ksm_scan ksm_scan = {
};
static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
/* The number of nodes in the stable tree */
@@ -161,11 +179,8 @@ static unsigned long ksm_pages_unshared;
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
-/* Limit on the number of unswappable pages used */
-static unsigned long ksm_max_kernel_pages = 2000;
-
/* Number of pages ksmd should scan in one batch */
-static unsigned int ksm_thread_pages_to_scan = 200;
+static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
@@ -173,7 +188,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20;
#define KSM_RUN_STOP 0
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
-static unsigned int ksm_run = KSM_RUN_MERGE;
+static unsigned int ksm_run = KSM_RUN_STOP;
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
@@ -189,13 +204,19 @@ static int __init ksm_slab_init(void)
if (!rmap_item_cache)
goto out;
+ stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+ if (!stable_node_cache)
+ goto out_free1;
+
mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
if (!mm_slot_cache)
- goto out_free;
+ goto out_free2;
return 0;
-out_free:
+out_free2:
+ kmem_cache_destroy(stable_node_cache);
+out_free1:
kmem_cache_destroy(rmap_item_cache);
out:
return -ENOMEM;
@@ -204,6 +225,7 @@ out:
static void __init ksm_slab_free(void)
{
kmem_cache_destroy(mm_slot_cache);
+ kmem_cache_destroy(stable_node_cache);
kmem_cache_destroy(rmap_item_cache);
mm_slot_cache = NULL;
}
@@ -225,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
kmem_cache_free(rmap_item_cache, rmap_item);
}
+static inline struct stable_node *alloc_stable_node(void)
+{
+ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+ kmem_cache_free(stable_node_cache, stable_node);
+}
+
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
@@ -274,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
% MM_SLOTS_HASH_HEADS];
mm_slot->mm = mm;
- INIT_LIST_HEAD(&mm_slot->rmap_list);
hlist_add_head(&mm_slot->link, bucket);
}
@@ -283,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
return rmap_item->address & STABLE_FLAG;
}
+static void hold_anon_vma(struct rmap_item *rmap_item,
+ struct anon_vma *anon_vma)
+{
+ rmap_item->anon_vma = anon_vma;
+ atomic_inc(&anon_vma->ksm_refcount);
+}
+
+static void drop_anon_vma(struct rmap_item *rmap_item)
+{
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+
+ if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
+ int empty = list_empty(&anon_vma->head);
+ spin_unlock(&anon_vma->lock);
+ if (empty)
+ anon_vma_free(anon_vma);
+ }
+}
+
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -355,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
-static void break_cow(struct mm_struct *mm, unsigned long addr)
+static void break_cow(struct rmap_item *rmap_item)
{
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
+ /*
+ * It is not an accident that whenever we want to break COW
+ * to undo, we also need to drop a reference to the anon_vma.
+ */
+ drop_anon_vma(rmap_item);
+
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
@@ -402,21 +460,77 @@ out: page = NULL;
return page;
}
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+ struct rmap_item *rmap_item;
+ struct hlist_node *hlist;
+
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+ drop_anon_vma(rmap_item);
+ rmap_item->address &= PAGE_MASK;
+ cond_resched();
+ }
+
+ rb_erase(&stable_node->node, &root_stable_tree);
+ free_stable_node(stable_node);
+}
+
/*
- * get_ksm_page: checks if the page at the virtual address in rmap_item
- * is still PageKsm, in which case we can trust the content of the page,
- * and it returns the gotten page; but NULL if the page has been zapped.
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct rmap_item *rmap_item)
+static struct page *get_ksm_page(struct stable_node *stable_node)
{
struct page *page;
-
- page = get_mergeable_page(rmap_item);
- if (page && !PageKsm(page)) {
+ void *expected_mapping;
+
+ page = pfn_to_page(stable_node->kpfn);
+ expected_mapping = (void *)stable_node +
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+ rcu_read_lock();
+ if (page->mapping != expected_mapping)
+ goto stale;
+ if (!get_page_unless_zero(page))
+ goto stale;
+ if (page->mapping != expected_mapping) {
put_page(page);
- page = NULL;
+ goto stale;
}
+ rcu_read_unlock();
return page;
+stale:
+ rcu_read_unlock();
+ remove_node_from_stable_tree(stable_node);
+ return NULL;
}
/*
@@ -425,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item)
*/
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
- if (in_stable_tree(rmap_item)) {
- struct rmap_item *next_item = rmap_item->next;
-
- if (rmap_item->address & NODE_FLAG) {
- if (next_item) {
- rb_replace_node(&rmap_item->node,
- &next_item->node,
- &root_stable_tree);
- next_item->address |= NODE_FLAG;
- ksm_pages_sharing--;
- } else {
- rb_erase(&rmap_item->node, &root_stable_tree);
- ksm_pages_shared--;
- }
- } else {
- struct rmap_item *prev_item = rmap_item->prev;
+ if (rmap_item->address & STABLE_FLAG) {
+ struct stable_node *stable_node;
+ struct page *page;
- BUG_ON(prev_item->next != rmap_item);
- prev_item->next = next_item;
- if (next_item) {
- BUG_ON(next_item->prev != rmap_item);
- next_item->prev = rmap_item->prev;
- }
+ stable_node = rmap_item->head;
+ page = get_ksm_page(stable_node);
+ if (!page)
+ goto out;
+
+ lock_page(page);
+ hlist_del(&rmap_item->hlist);
+ unlock_page(page);
+ put_page(page);
+
+ if (stable_node->hlist.first)
ksm_pages_sharing--;
- }
+ else
+ ksm_pages_shared--;
- rmap_item->next = NULL;
+ drop_anon_vma(rmap_item);
+ rmap_item->address &= PAGE_MASK;
- } else if (rmap_item->address & NODE_FLAG) {
+ } else if (rmap_item->address & UNSTABLE_FLAG) {
unsigned char age;
/*
* Usually ksmd can and must skip the rb_erase, because
@@ -466,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
BUG_ON(age > 1);
if (!age)
rb_erase(&rmap_item->node, &root_unstable_tree);
+
ksm_pages_unshared--;
+ rmap_item->address &= PAGE_MASK;
}
-
- rmap_item->address &= PAGE_MASK;
-
+out:
cond_resched(); /* we're called from many long loops */
}
static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
- struct list_head *cur)
+ struct rmap_item **rmap_list)
{
- struct rmap_item *rmap_item;
-
- while (cur != &mm_slot->rmap_list) {
- rmap_item = list_entry(cur, struct rmap_item, link);
- cur = cur->next;
+ while (*rmap_list) {
+ struct rmap_item *rmap_item = *rmap_list;
+ *rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
- list_del(&rmap_item->link);
free_rmap_item(rmap_item);
}
}
@@ -549,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void)
goto error;
}
- remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
+ remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -645,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
- if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
set_pte_at_notify(mm, addr, ptep, entry);
goto out_unlock;
}
@@ -663,15 +768,15 @@ out:
/**
* replace_page - replace page in vma by new ksm page
- * @vma: vma that holds the pte pointing to oldpage
- * @oldpage: the page we are replacing by newpage
- * @newpage: the ksm page we replace oldpage by
+ * @vma: vma that holds the pte pointing to page
+ * @page: the page we are replacing by kpage
+ * @kpage: the ksm page we replace page by
* @orig_pte: the original value of the pte
*
* Returns 0 on success, -EFAULT on failure.
*/
-static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
- struct page *newpage, pte_t orig_pte)
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+ struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -680,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
pte_t *ptep;
spinlock_t *ptl;
unsigned long addr;
- pgprot_t prot;
int err = -EFAULT;
- prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
-
- addr = page_address_in_vma(oldpage, vma);
+ addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
@@ -707,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
goto out;
}
- get_page(newpage);
- page_add_ksm_rmap(newpage);
+ get_page(kpage);
+ page_add_anon_rmap(kpage, vma, addr);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(oldpage);
- put_page(oldpage);
+ page_remove_rmap(page);
+ put_page(page);
pte_unmap_unlock(ptep, ptl);
err = 0;
@@ -725,32 +827,27 @@ out:
/*
* try_to_merge_one_page - take two pages and merge them into one
- * @vma: the vma that hold the pte pointing into oldpage
- * @oldpage: the page that we want to replace with newpage
- * @newpage: the page that we want to map instead of oldpage
- *
- * Note:
- * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
- * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
+ * @vma: the vma that holds the pte pointing to page
+ * @page: the PageAnon page that we want to replace with kpage
+ * @kpage: the PageKsm page that we want to map instead of page,
+ * or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
static int try_to_merge_one_page(struct vm_area_struct *vma,
- struct page *oldpage,
- struct page *newpage)
+ struct page *page, struct page *kpage)
{
pte_t orig_pte = __pte(0);
int err = -EFAULT;
+ if (page == kpage) /* ksm page forked */
+ return 0;
+
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
-
- if (!PageAnon(oldpage))
+ if (!PageAnon(page))
goto out;
- get_page(newpage);
- get_page(oldpage);
-
/*
* We need the page lock to read a stable PageSwapCache in
* write_protect_page(). We use trylock_page() instead of
@@ -758,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
* prefer to continue scanning and merging different pages,
* then come back to this page when it is unlocked.
*/
- if (!trylock_page(oldpage))
- goto out_putpage;
+ if (!trylock_page(page))
+ goto out;
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
- if (write_protect_page(vma, oldpage, &orig_pte)) {
- unlock_page(oldpage);
- goto out_putpage;
+ if (write_protect_page(vma, page, &orig_pte) == 0) {
+ if (!kpage) {
+ /*
+ * While we hold page lock, upgrade page from
+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * stable_tree_insert() will update stable_node.
+ */
+ set_page_stable_node(page, NULL);
+ mark_page_accessed(page);
+ err = 0;
+ } else if (pages_identical(page, kpage))
+ err = replace_page(vma, page, kpage, orig_pte);
}
- unlock_page(oldpage);
- if (pages_identical(oldpage, newpage))
- err = replace_page(vma, oldpage, newpage, orig_pte);
+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
+ munlock_vma_page(page);
+ if (!PageMlocked(kpage)) {
+ unlock_page(page);
+ lock_page(kpage);
+ mlock_vma_page(kpage);
+ page = kpage; /* for final unlock */
+ }
+ }
-out_putpage:
- put_page(oldpage);
- put_page(newpage);
+ unlock_page(page);
out:
return err;
}
@@ -785,26 +895,31 @@ out:
/*
* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
* but no new kernel page is allocated: kpage must already be a ksm page.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
-static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
- unsigned long addr1,
- struct page *page1,
- struct page *kpage)
+static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+ struct page *page, struct page *kpage)
{
+ struct mm_struct *mm = rmap_item->mm;
struct vm_area_struct *vma;
int err = -EFAULT;
- down_read(&mm1->mmap_sem);
- if (ksm_test_exit(mm1))
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ goto out;
+ vma = find_vma(mm, rmap_item->address);
+ if (!vma || vma->vm_start > rmap_item->address)
goto out;
- vma = find_vma(mm1, addr1);
- if (!vma || vma->vm_start > addr1)
+ err = try_to_merge_one_page(vma, page, kpage);
+ if (err)
goto out;
- err = try_to_merge_one_page(vma, page1, kpage);
+ /* Must get reference to anon_vma while still holding mmap_sem */
+ hold_anon_vma(rmap_item, vma->anon_vma);
out:
- up_read(&mm1->mmap_sem);
+ up_read(&mm->mmap_sem);
return err;
}
@@ -812,109 +927,73 @@ out:
* try_to_merge_two_pages - take two identical pages and prepare them
* to be merged into one page.
*
- * This function returns 0 if we successfully mapped two identical pages
- * into one page, -EFAULT otherwise.
+ * This function returns the kpage if we successfully merged two identical
+ * pages into one ksm page, NULL otherwise.
*
- * Note that this function allocates a new kernel page: if one of the pages
+ * Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
-static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
- struct page *page1, struct mm_struct *mm2,
- unsigned long addr2, struct page *page2)
+static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+ struct page *page,
+ struct rmap_item *tree_rmap_item,
+ struct page *tree_page)
{
- struct vm_area_struct *vma;
- struct page *kpage;
- int err = -EFAULT;
-
- /*
- * The number of nodes in the stable tree
- * is the number of kernel pages that we hold.
- */
- if (ksm_max_kernel_pages &&
- ksm_max_kernel_pages <= ksm_pages_shared)
- return err;
-
- kpage = alloc_page(GFP_HIGHUSER);
- if (!kpage)
- return err;
-
- down_read(&mm1->mmap_sem);
- if (ksm_test_exit(mm1)) {
- up_read(&mm1->mmap_sem);
- goto out;
- }
- vma = find_vma(mm1, addr1);
- if (!vma || vma->vm_start > addr1) {
- up_read(&mm1->mmap_sem);
- goto out;
- }
-
- copy_user_highpage(kpage, page1, addr1, vma);
- err = try_to_merge_one_page(vma, page1, kpage);
- up_read(&mm1->mmap_sem);
+ int err;
+ err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
if (!err) {
- err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
+ err = try_to_merge_with_ksm_page(tree_rmap_item,
+ tree_page, page);
/*
* If that fails, we have a ksm page with only one pte
* pointing to it: so break it.
*/
if (err)
- break_cow(mm1, addr1);
+ break_cow(rmap_item);
}
-out:
- put_page(kpage);
- return err;
+ return err ? NULL : page;
}
/*
- * stable_tree_search - search page inside the stable tree
- * @page: the page that we are searching identical pages to.
- * @page2: pointer into identical page that we are holding inside the stable
- * tree that we have found.
- * @rmap_item: the reverse mapping item
+ * stable_tree_search - search for page inside the stable tree
*
* This function checks if there is a page inside the stable tree
* with identical content to the page that we are scanning right now.
*
- * This function return rmap_item pointer to the identical item if found,
+ * This function returns the stable tree node of identical content if found,
* NULL otherwise.
*/
-static struct rmap_item *stable_tree_search(struct page *page,
- struct page **page2,
- struct rmap_item *rmap_item)
+static struct page *stable_tree_search(struct page *page)
{
struct rb_node *node = root_stable_tree.rb_node;
+ struct stable_node *stable_node;
+
+ stable_node = page_stable_node(page);
+ if (stable_node) { /* ksm page forked */
+ get_page(page);
+ return page;
+ }
while (node) {
- struct rmap_item *tree_rmap_item, *next_rmap_item;
+ struct page *tree_page;
int ret;
- tree_rmap_item = rb_entry(node, struct rmap_item, node);
- while (tree_rmap_item) {
- BUG_ON(!in_stable_tree(tree_rmap_item));
- cond_resched();
- page2[0] = get_ksm_page(tree_rmap_item);
- if (page2[0])
- break;
- next_rmap_item = tree_rmap_item->next;
- remove_rmap_item_from_tree(tree_rmap_item);
- tree_rmap_item = next_rmap_item;
- }
- if (!tree_rmap_item)
+ cond_resched();
+ stable_node = rb_entry(node, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
+ if (!tree_page)
return NULL;
- ret = memcmp_pages(page, page2[0]);
+ ret = memcmp_pages(page, tree_page);
if (ret < 0) {
- put_page(page2[0]);
+ put_page(tree_page);
node = node->rb_left;
} else if (ret > 0) {
- put_page(page2[0]);
+ put_page(tree_page);
node = node->rb_right;
- } else {
- return tree_rmap_item;
- }
+ } else
+ return tree_page;
}
return NULL;
@@ -924,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page,
* stable_tree_insert - insert rmap_item pointing to new ksm page
* into the stable tree.
*
- * @page: the page that we are searching identical page to inside the stable
- * tree.
- * @rmap_item: pointer to the reverse mapping item.
- *
- * This function returns rmap_item if success, NULL otherwise.
+ * This function returns the stable tree node just allocated on success,
+ * NULL otherwise.
*/
-static struct rmap_item *stable_tree_insert(struct page *page,
- struct rmap_item *rmap_item)
+static struct stable_node *stable_tree_insert(struct page *kpage)
{
struct rb_node **new = &root_stable_tree.rb_node;
struct rb_node *parent = NULL;
+ struct stable_node *stable_node;
while (*new) {
- struct rmap_item *tree_rmap_item, *next_rmap_item;
struct page *tree_page;
int ret;
- tree_rmap_item = rb_entry(*new, struct rmap_item, node);
- while (tree_rmap_item) {
- BUG_ON(!in_stable_tree(tree_rmap_item));
- cond_resched();
- tree_page = get_ksm_page(tree_rmap_item);
- if (tree_page)
- break;
- next_rmap_item = tree_rmap_item->next;
- remove_rmap_item_from_tree(tree_rmap_item);
- tree_rmap_item = next_rmap_item;
- }
- if (!tree_rmap_item)
+ cond_resched();
+ stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
+ if (!tree_page)
return NULL;
- ret = memcmp_pages(page, tree_page);
+ ret = memcmp_pages(kpage, tree_page);
put_page(tree_page);
parent = *new;
@@ -973,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page,
}
}
- rmap_item->address |= NODE_FLAG | STABLE_FLAG;
- rmap_item->next = NULL;
- rb_link_node(&rmap_item->node, parent, new);
- rb_insert_color(&rmap_item->node, &root_stable_tree);
+ stable_node = alloc_stable_node();
+ if (!stable_node)
+ return NULL;
- ksm_pages_shared++;
- return rmap_item;
+ rb_link_node(&stable_node->node, parent, new);
+ rb_insert_color(&stable_node->node, &root_stable_tree);
+
+ INIT_HLIST_HEAD(&stable_node->hlist);
+
+ stable_node->kpfn = page_to_pfn(kpage);
+ set_page_stable_node(kpage, stable_node);
+
+ return stable_node;
}
/*
- * unstable_tree_search_insert - search and insert items into the unstable tree.
- *
- * @page: the page that we are going to search for identical page or to insert
- * into the unstable tree
- * @page2: pointer into identical page that was found inside the unstable tree
- * @rmap_item: the reverse mapping item of page
+ * unstable_tree_search_insert - search for identical page,
+ * else insert rmap_item into the unstable tree.
*
* This function searches for a page in the unstable tree identical to the
* page currently being scanned; and if no identical page is found in the
@@ -1000,46 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page,
* This function does both searching and inserting, because they share
* the same walking algorithm in an rbtree.
*/
-static struct rmap_item *unstable_tree_search_insert(struct page *page,
- struct page **page2,
- struct rmap_item *rmap_item)
+static
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+ struct page *page,
+ struct page **tree_pagep)
+
{
struct rb_node **new = &root_unstable_tree.rb_node;
struct rb_node *parent = NULL;
while (*new) {
struct rmap_item *tree_rmap_item;
+ struct page *tree_page;
int ret;
+ cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
- page2[0] = get_mergeable_page(tree_rmap_item);
- if (!page2[0])
+ tree_page = get_mergeable_page(tree_rmap_item);
+ if (!tree_page)
return NULL;
/*
- * Don't substitute an unswappable ksm page
- * just for one good swappable forked page.
+ * Don't substitute a ksm page for a forked page.
*/
- if (page == page2[0]) {
- put_page(page2[0]);
+ if (page == tree_page) {
+ put_page(tree_page);
return NULL;
}
- ret = memcmp_pages(page, page2[0]);
+ ret = memcmp_pages(page, tree_page);
parent = *new;
if (ret < 0) {
- put_page(page2[0]);
+ put_page(tree_page);
new = &parent->rb_left;
} else if (ret > 0) {
- put_page(page2[0]);
+ put_page(tree_page);
new = &parent->rb_right;
} else {
+ *tree_pagep = tree_page;
return tree_rmap_item;
}
}
- rmap_item->address |= NODE_FLAG;
+ rmap_item->address |= UNSTABLE_FLAG;
rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
rb_link_node(&rmap_item->node, parent, new);
rb_insert_color(&rmap_item->node, &root_unstable_tree);
@@ -1054,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
* the same ksm page.
*/
static void stable_tree_append(struct rmap_item *rmap_item,
- struct rmap_item *tree_rmap_item)
+ struct stable_node *stable_node)
{
- rmap_item->next = tree_rmap_item->next;
- rmap_item->prev = tree_rmap_item;
-
- if (tree_rmap_item->next)
- tree_rmap_item->next->prev = rmap_item;
-
- tree_rmap_item->next = rmap_item;
+ rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG;
+ hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
- ksm_pages_sharing++;
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing++;
+ else
+ ksm_pages_shared++;
}
/*
@@ -1079,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item,
*/
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
- struct page *page2[1];
struct rmap_item *tree_rmap_item;
+ struct page *tree_page = NULL;
+ struct stable_node *stable_node;
+ struct page *kpage;
unsigned int checksum;
int err;
- if (in_stable_tree(rmap_item))
- remove_rmap_item_from_tree(rmap_item);
+ remove_rmap_item_from_tree(rmap_item);
/* We first start with searching the page inside the stable tree */
- tree_rmap_item = stable_tree_search(page, page2, rmap_item);
- if (tree_rmap_item) {
- if (page == page2[0]) /* forked */
- err = 0;
- else
- err = try_to_merge_with_ksm_page(rmap_item->mm,
- rmap_item->address,
- page, page2[0]);
- put_page(page2[0]);
-
+ kpage = stable_tree_search(page);
+ if (kpage) {
+ err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
- stable_tree_append(rmap_item, tree_rmap_item);
+ lock_page(kpage);
+ stable_tree_append(rmap_item, page_stable_node(kpage));
+ unlock_page(kpage);
}
+ put_page(kpage);
return;
}
/*
- * A ksm page might have got here by fork, but its other
- * references have already been removed from the stable tree.
- * Or it might be left over from a break_ksm which failed
- * when the mem_cgroup had reached its limit: try again now.
- */
- if (PageKsm(page))
- break_cow(rmap_item->mm, rmap_item->address);
-
- /*
- * In case the hash value of the page was changed from the last time we
- * have calculated it, this page to be changed frequely, therefore we
- * don't want to insert it to the unstable tree, and we don't want to
- * waste our time to search if there is something identical to it there.
+ * If the hash value of the page has changed from the last time
+ * we calculated it, this page is changing frequently: therefore we
+ * don't want to insert it in the unstable tree, and we don't want
+ * to waste our time searching for something identical to it there.
*/
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
@@ -1129,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
return;
}
- tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
+ tree_rmap_item =
+ unstable_tree_search_insert(rmap_item, page, &tree_page);
if (tree_rmap_item) {
- err = try_to_merge_two_pages(rmap_item->mm,
- rmap_item->address, page,
- tree_rmap_item->mm,
- tree_rmap_item->address, page2[0]);
+ kpage = try_to_merge_two_pages(rmap_item, page,
+ tree_rmap_item, tree_page);
+ put_page(tree_page);
/*
* As soon as we merge this page, we want to remove the
* rmap_item of the page we have merged with from the unstable
* tree, and insert it instead as new node in the stable tree.
*/
- if (!err) {
- rb_erase(&tree_rmap_item->node, &root_unstable_tree);
- tree_rmap_item->address &= ~NODE_FLAG;
- ksm_pages_unshared--;
+ if (kpage) {
+ remove_rmap_item_from_tree(tree_rmap_item);
+
+ lock_page(kpage);
+ stable_node = stable_tree_insert(kpage);
+ if (stable_node) {
+ stable_tree_append(tree_rmap_item, stable_node);
+ stable_tree_append(rmap_item, stable_node);
+ }
+ unlock_page(kpage);
/*
* If we fail to insert the page into the stable tree,
@@ -1151,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* to a ksm page left outside the stable tree,
* in which case we need to break_cow on both.
*/
- if (stable_tree_insert(page2[0], tree_rmap_item))
- stable_tree_append(rmap_item, tree_rmap_item);
- else {
- break_cow(tree_rmap_item->mm,
- tree_rmap_item->address);
- break_cow(rmap_item->mm, rmap_item->address);
+ if (!stable_node) {
+ break_cow(tree_rmap_item);
+ break_cow(rmap_item);
}
}
-
- put_page(page2[0]);
}
}
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
- struct list_head *cur,
+ struct rmap_item **rmap_list,
unsigned long addr)
{
struct rmap_item *rmap_item;
- while (cur != &mm_slot->rmap_list) {
- rmap_item = list_entry(cur, struct rmap_item, link);
- if ((rmap_item->address & PAGE_MASK) == addr) {
- if (!in_stable_tree(rmap_item))
- remove_rmap_item_from_tree(rmap_item);
+ while (*rmap_list) {
+ rmap_item = *rmap_list;
+ if ((rmap_item->address & PAGE_MASK) == addr)
return rmap_item;
- }
if (rmap_item->address > addr)
break;
- cur = cur->next;
+ *rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
- list_del(&rmap_item->link);
free_rmap_item(rmap_item);
}
@@ -1190,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
/* It has already been zeroed */
rmap_item->mm = mm_slot->mm;
rmap_item->address = addr;
- list_add_tail(&rmap_item->link, cur);
+ rmap_item->rmap_list = *rmap_list;
+ *rmap_list = rmap_item;
}
return rmap_item;
}
@@ -1215,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
spin_unlock(&ksm_mmlist_lock);
next_mm:
ksm_scan.address = 0;
- ksm_scan.rmap_item = list_entry(&slot->rmap_list,
- struct rmap_item, link);
+ ksm_scan.rmap_list = &slot->rmap_list;
}
mm = slot->mm;
@@ -1242,10 +1298,10 @@ next_mm:
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
- ksm_scan.rmap_item->link.next,
- ksm_scan.address);
+ ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
- ksm_scan.rmap_item = rmap_item;
+ ksm_scan.rmap_list =
+ &rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
@@ -1261,14 +1317,13 @@ next_mm:
if (ksm_test_exit(mm)) {
ksm_scan.address = 0;
- ksm_scan.rmap_item = list_entry(&slot->rmap_list,
- struct rmap_item, link);
+ ksm_scan.rmap_list = &slot->rmap_list;
}
/*
* Nuke all the rmap_items that are above this current rmap:
* because there were no VM_MERGEABLE vmas with such addresses.
*/
- remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
+ remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -1321,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages)
return;
if (!PageKsm(page) || !in_stable_tree(rmap_item))
cmp_and_merge_page(page, rmap_item);
- else if (page_mapcount(page) == 1) {
- /*
- * Replace now-unshared ksm page by ordinary page.
- */
- break_cow(rmap_item->mm, rmap_item->address);
- remove_rmap_item_from_tree(rmap_item);
- rmap_item->oldchecksum = calc_checksum(page);
- }
put_page(page);
}
}
@@ -1373,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
+ VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
return 0; /* just ignore the advice */
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1450,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm)
spin_lock(&ksm_mmlist_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
- if (list_empty(&mm_slot->rmap_list)) {
+ if (!mm_slot->rmap_list) {
hlist_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
easy_to_free = 1;
@@ -1471,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm)
}
}
+struct page *ksm_does_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct page *new_page;
+
+ unlock_page(page); /* any racers will COW it, not modify it */
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page) {
+ copy_user_highpage(new_page, page, address, vma);
+
+ SetPageDirty(new_page);
+ __SetPageUptodate(new_page);
+ SetPageSwapBacked(new_page);
+ __set_page_locked(new_page);
+
+ if (page_evictable(new_page, vma))
+ lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
+ else
+ add_page_to_unevictable_list(new_page);
+ }
+
+ page_cache_release(page);
+ return new_page;
+}
+
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
+{
+ struct stable_node *stable_node;
+ struct rmap_item *rmap_item;
+ struct hlist_node *hlist;
+ unsigned int mapcount = page_mapcount(page);
+ int referenced = 0;
+ int search_new_forks = 0;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return 0;
+again:
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct vm_area_struct *vma;
+
+ spin_lock(&anon_vma->lock);
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+ continue;
+
+ referenced += page_referenced_one(page, vma,
+ rmap_item->address, &mapcount, vm_flags);
+ if (!search_new_forks || !mapcount)
+ break;
+ }
+ spin_unlock(&anon_vma->lock);
+ if (!mapcount)
+ goto out;
+ }
+ if (!search_new_forks++)
+ goto again;
+out:
+ return referenced;
+}
+
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+ struct stable_node *stable_node;
+ struct hlist_node *hlist;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ int search_new_forks = 0;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return SWAP_FAIL;
+again:
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct vm_area_struct *vma;
+
+ spin_lock(&anon_vma->lock);
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ ret = try_to_unmap_one(page, vma,
+ rmap_item->address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page)) {
+ spin_unlock(&anon_vma->lock);
+ goto out;
+ }
+ }
+ spin_unlock(&anon_vma->lock);
+ }
+ if (!search_new_forks++)
+ goto again;
+out:
+ return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+ struct stable_node *stable_node;
+ struct hlist_node *hlist;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ int search_new_forks = 0;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return ret;
+again:
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct vm_area_struct *vma;
+
+ spin_lock(&anon_vma->lock);
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ ret = rmap_one(page, vma, rmap_item->address, arg);
+ if (ret != SWAP_AGAIN) {
+ spin_unlock(&anon_vma->lock);
+ goto out;
+ }
+ }
+ spin_unlock(&anon_vma->lock);
+ }
+ if (!search_new_forks++)
+ goto again;
+out:
+ return ret;
+}
+
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+ struct stable_node *stable_node;
+
+ VM_BUG_ON(!PageLocked(oldpage));
+ VM_BUG_ON(!PageLocked(newpage));
+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
+
+ stable_node = page_stable_node(newpage);
+ if (stable_node) {
+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+ stable_node->kpfn = page_to_pfn(newpage);
+ }
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct rb_node *node;
+
+ for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
+ struct stable_node *stable_node;
+
+ stable_node = rb_entry(node, struct stable_node, node);
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn)
+ return stable_node;
+ }
+ return NULL;
+}
+
+static int ksm_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+ struct stable_node *stable_node;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Keep it very simple for now: just lock out ksmd and
+ * MADV_UNMERGEABLE while any memory is going offline.
+ */
+ mutex_lock(&ksm_thread_mutex);
+ break;
+
+ case MEM_OFFLINE:
+ /*
+ * Most of the work is done by page migration; but there might
+ * be a few stable_nodes left over, still pointing to struct
+ * pages which have been offlined: prune those from the tree.
+ */
+ while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
+ mn->start_pfn + mn->nr_pages)) != NULL)
+ remove_node_from_stable_tree(stable_node);
+ /* fallthrough */
+
+ case MEM_CANCEL_OFFLINE:
+ mutex_unlock(&ksm_thread_mutex);
+ break;
+ }
+ return NOTIFY_OK;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
#ifdef CONFIG_SYSFS
/*
* This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -1549,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
/*
* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
- * breaking COW to free the unswappable pages_shared (but leaves
- * mm_slots on the list for when ksmd may be set running again).
+ * breaking COW to free the pages_shared (but leaves mm_slots
+ * on the list for when ksmd may be set running again).
*/
mutex_lock(&ksm_thread_mutex);
@@ -1575,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
}
KSM_ATTR(run);
-static ssize_t max_kernel_pages_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- int err;
- unsigned long nr_pages;
-
- err = strict_strtoul(buf, 10, &nr_pages);
- if (err)
- return -EINVAL;
-
- ksm_max_kernel_pages = nr_pages;
-
- return count;
-}
-
-static ssize_t max_kernel_pages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
-}
-KSM_ATTR(max_kernel_pages);
-
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -1647,7 +1914,6 @@ static struct attribute *ksm_attrs[] = {
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
&run_attr.attr,
- &max_kernel_pages_attr.attr,
&pages_shared_attr.attr,
&pages_sharing_attr.attr,
&pages_unshared_attr.attr,
@@ -1689,8 +1955,18 @@ static int __init ksm_init(void)
kthread_stop(ksm_thread);
goto out_free2;
}
+#else
+ ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
+
#endif /* CONFIG_SYSFS */
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ /*
+ * Choose a high priority since the callback takes ksm_thread_mutex:
+ * later callbacks could only be taking locks which nest within that.
+ */
+ hotplug_memory_notifier(ksm_memory_callback, 100);
+#endif
return 0;
out_free2:
diff --git a/mm/madvise.c b/mm/madvise.c
index d9ae2067952e..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Error injection support for memory error handling.
+ */
+static int madvise_hwpoison(unsigned long start, unsigned long end)
+{
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ for (; start < end; start += PAGE_SIZE) {
+ struct page *p;
+ int ret = get_user_pages(current, current->mm, start, 1,
+ 0, 0, &p, NULL);
+ if (ret != 1)
+ return ret;
+ printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
+ page_to_pfn(p), start);
+ /* Ignore return value for now */
+ __memory_failure(page_to_pfn(p), 0, 1);
+ put_page(p);
+ }
+ return ret;
+}
+#endif
+
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
@@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
int write;
size_t len;
+#ifdef CONFIG_MEMORY_FAILURE
+ if (behavior == MADV_HWPOISON)
+ return madvise_hwpoison(start, start+len_in);
+#endif
if (!madvise_behavior_valid(behavior))
return error;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d8753784..878808c4fcbe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
@@ -37,12 +38,14 @@
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
+#include <linux/cpu.h>
#include "internal.h"
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
+struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -52,7 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
#define do_swap_account (0)
#endif
-static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
/*
* Statistics for memory cgroup.
@@ -63,9 +66,11 @@ enum mem_cgroup_stat_index {
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
+ MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
struct mem_cgroup_stat_cpu cpustat[0];
};
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+ enum mem_cgroup_stat_index idx)
+{
+ stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+ enum mem_cgroup_stat_index idx)
+{
+ return stat->count[idx];
+}
+
/*
* For accounting under irq disable, no need for increment preempt count.
*/
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
unsigned long count[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long long usage_in_excess;/* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_tree;
+ struct mem_cgroup *mem; /* Back pointer, we cannot */
+ /* use container_of */
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
};
/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+ struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+ struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -164,7 +209,7 @@ struct mem_cgroup {
int prev_priority; /* for recording reclaim priority */
/*
- * While reclaiming in a hiearchy, we cache the last child we
+ * While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
*/
int last_scanned_child;
@@ -186,6 +231,13 @@ struct mem_cgroup {
struct mem_cgroup_stat stat;
};
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
+
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
#define PCGF_LOCK (1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
- PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
- PCGF_USED | PCGF_LOCK, /* Anon */
- PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
- 0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT (1UL << PCG_ACCT)
/* for encoding cft->private value on file */
#define _MEM (0)
@@ -215,15 +262,238 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
+#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
+#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock_async(void);
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+ return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+ struct mem_cgroup *mem = pc->mem_cgroup;
+ int nid = page_cgroup_nid(pc);
+ int zid = page_cgroup_zid(pc);
+
+ if (!mem)
+ return NULL;
+
+ return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+ return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
+
+ return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
+{
+ struct rb_node **p = &mctz->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct mem_cgroup_per_zone *mz_node;
+
+ if (mz->on_tree)
+ return;
+
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
+ while (*p) {
+ parent = *p;
+ mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+ tree_node);
+ if (mz->usage_in_excess < mz_node->usage_in_excess)
+ p = &(*p)->rb_left;
+ /*
+ * We can't avoid mem cgroups that are over their soft
+ * limit by the same amount
+ */
+ else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&mz->tree_node, parent, p);
+ rb_insert_color(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
+{
+ if (!mz->on_tree)
+ return;
+ rb_erase(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
+{
+ spin_lock(&mctz->lock);
+ __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+ bool ret = false;
+ int cpu;
+ s64 val;
+ struct mem_cgroup_stat_cpu *cpustat;
+
+ cpu = get_cpu();
+ cpustat = &mem->stat.cpustat[cpu];
+ val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+ if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+ __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+ ret = true;
+ }
+ put_cpu();
+ return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+ unsigned long long excess;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_tree_per_zone *mctz;
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
+ mctz = soft_limit_tree_from_page(page);
+
+ /*
+ * Necessary to update all ancestors when hierarchy is used.
+ * because their event counter is not touched.
+ */
+ for (; mem; mem = parent_mem_cgroup(mem)) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ excess = res_counter_soft_limit_excess(&mem->res);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (excess || mz->on_tree) {
+ spin_lock(&mctz->lock);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ /*
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
+ */
+ __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ }
+ }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+ int node, zone;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_tree_per_zone *mctz;
+
+ for_each_node_state(node, N_POSSIBLE) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ mz = mem_cgroup_zoneinfo(mem, node, zone);
+ mctz = soft_limit_tree_node_zone(node, zone);
+ mem_cgroup_remove_exceeded(mem, mz, mctz);
+ }
+ }
+}
+
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+ return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+ struct rb_node *rightmost = NULL;
+ struct mem_cgroup_per_zone *mz;
+
+retry:
+ mz = NULL;
+ rightmost = rb_last(&mctz->rb_root);
+ if (!rightmost)
+ goto done; /* Nothing to reclaim from */
+
+ mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+ /*
+ * Remove the node now but someone else can add it back,
+ * we will to add it back at the end of reclaim to its correct
+ * position in the tree.
+ */
+ __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+ if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+ !css_tryget(&mz->mem->css))
+ goto retry;
+done:
+ return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ spin_lock(&mctz->lock);
+ mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ spin_unlock(&mctz->lock);
+ return mz;
+}
+
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+ bool charge)
+{
+ int val = (charge) ? 1 : -1;
+ struct mem_cgroup_stat *stat = &mem->stat;
+ struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
+
+ cpustat = &stat->cpustat[cpu];
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+ put_cpu();
+}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
{
- int val = (charge)? 1 : -1;
+ int val = (charge) ? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat;
struct mem_cgroup_stat_cpu *cpustat;
int cpu = get_cpu();
@@ -240,28 +510,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
else
__mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
put_cpu();
}
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
- return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
- struct mem_cgroup *mem = pc->mem_cgroup;
- int nid = page_cgroup_nid(pc);
- int zid = page_cgroup_zid(pc);
-
- if (!mem)
- return NULL;
-
- return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
@@ -354,6 +606,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
return ret;
}
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+ return (mem == root_mem_cgroup);
+}
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +628,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
struct page_cgroup *pc;
- struct mem_cgroup *mem;
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
/* can happen while we handle swapcache. */
- if (list_empty(&pc->lru) || !pc->mem_cgroup)
+ if (!TestClearPageCgroupAcctLRU(pc))
return;
+ VM_BUG_ON(!pc->mem_cgroup);
/*
* We don't check PCG_USED bit. It's cleared when the "page" is finally
* removed from global LRU.
*/
mz = page_cgroup_zoneinfo(pc);
- mem = pc->mem_cgroup;
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ if (mem_cgroup_is_root(pc->mem_cgroup))
+ return;
+ VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
return;
}
@@ -410,8 +669,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
* For making pc->mem_cgroup visible, insert smp_rmb() here.
*/
smp_rmb();
- /* unused page is not rotated. */
- if (!PageCgroupUsed(pc))
+ /* unused or root page is not rotated. */
+ if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +684,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
+ VM_BUG_ON(PageCgroupAcctLRU(pc));
/*
* Used bit is set without atomic ops but after smp_wmb().
* For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +695,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
mz = page_cgroup_zoneinfo(pc);
MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ SetPageCgroupAcctLRU(pc);
+ if (mem_cgroup_is_root(pc->mem_cgroup))
+ return;
list_add(&pc->lru, &mz->lists[lru]);
}
@@ -469,7 +732,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
/* link when the page is linked to LRU but page_cgroup isn't */
- if (PageLRU(page) && list_empty(&pc->lru))
+ if (PageLRU(page) && !PageCgroupAcctLRU(pc))
mem_cgroup_add_lru_list(page, page_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
@@ -496,7 +759,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
task_unlock(task);
if (!curr)
return 0;
- if (curr->use_hierarchy)
+ /*
+ * We should check use_hierarchy of "mem" not "curr". Because checking
+ * use_hierarchy of "curr" here make this function true if hierarchy is
+ * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
+ * hierarchy(even if use_hierarchy is disabled in "mem").
+ */
+ if (mem->use_hierarchy)
ret = css_is_ancestor(&curr->css, &mem->css);
else
ret = (curr == mem);
@@ -745,7 +1014,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
static char memcg_name[PATH_MAX];
int ret;
- if (!memcg)
+ if (!memcg || !p)
return;
@@ -855,28 +1124,64 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
* If shrink==true, for avoiding to free too much, this returns immedieately.
*/
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
- gfp_t gfp_mask, bool noswap, bool shrink)
+ struct zone *zone,
+ gfp_t gfp_mask,
+ unsigned long reclaim_options)
{
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
+ bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
+ bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+ bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+ unsigned long excess = mem_cgroup_get_excess(root_mem);
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (root_mem->memsw_is_minimum)
noswap = true;
- while (loop < 2) {
+ while (1) {
victim = mem_cgroup_select_victim(root_mem);
- if (victim == root_mem)
+ if (victim == root_mem) {
loop++;
+ if (loop >= 1)
+ drain_all_stock_async();
+ if (loop >= 2) {
+ /*
+ * If we have not been able to reclaim
+ * anything, it might because there are
+ * no reclaimable pages under this hierarchy
+ */
+ if (!check_soft || !total) {
+ css_put(&victim->css);
+ break;
+ }
+ /*
+ * We want to do more targetted reclaim.
+ * excess >> 2 is not to excessive so as to
+ * reclaim too much, nor too less that we keep
+ * coming back to reclaim from this cgroup
+ */
+ if (total >= (excess >> 2) ||
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+ css_put(&victim->css);
+ break;
+ }
+ }
+ }
if (!mem_cgroup_local_usage(&victim->stat)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
}
/* we use swappiness of local cgroup */
- ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
- get_swappiness(victim));
+ if (check_soft)
+ ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+ noswap, get_swappiness(victim), zone,
+ zone->zone_pgdat->node_id);
+ else
+ ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+ noswap, get_swappiness(victim));
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
@@ -886,7 +1191,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
if (shrink)
return ret;
total += ret;
- if (mem_cgroup_check_under_limit(root_mem))
+ if (check_soft) {
+ if (res_counter_check_under_soft_limit(&root_mem->res))
+ return total;
+ } else if (mem_cgroup_check_under_limit(root_mem))
return 1 + total;
}
return total;
@@ -924,7 +1232,7 @@ static void record_last_oom(struct mem_cgroup *mem)
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*/
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
{
struct mem_cgroup *mem;
struct mem_cgroup_stat *stat;
@@ -932,9 +1240,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
int cpu;
struct page_cgroup *pc;
- if (!page_is_file_cache(page))
- return;
-
pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
@@ -954,22 +1259,150 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
stat = &mem->stat;
cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
done:
unlock_page_cgroup(pc);
}
/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define CHARGE_SIZE (32 * PAGE_SIZE)
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ int charge;
+ struct work_struct work;
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static atomic_t memcg_drain_count;
+
+/*
+ * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * from local stock and true is returned. If the stock is 0 or charges from a
+ * cgroup which is not current target, returns false. This stock will be
+ * refilled.
+ */
+static bool consume_stock(struct mem_cgroup *mem)
+{
+ struct memcg_stock_pcp *stock;
+ bool ret = true;
+
+ stock = &get_cpu_var(memcg_stock);
+ if (mem == stock->cached && stock->charge)
+ stock->charge -= PAGE_SIZE;
+ else /* need to call res_counter_charge */
+ ret = false;
+ put_cpu_var(memcg_stock);
+ return ret;
+}
+
+/*
+ * Returns stocks cached in percpu to res_counter and reset cached information.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+ struct mem_cgroup *old = stock->cached;
+
+ if (stock->charge) {
+ res_counter_uncharge(&old->res, stock->charge);
+ if (do_swap_account)
+ res_counter_uncharge(&old->memsw, stock->charge);
+ }
+ stock->cached = NULL;
+ stock->charge = 0;
+}
+
+/*
+ * This must be called under preempt disabled or must be called by
+ * a thread which is pinned to local cpu.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+ struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+ drain_stock(stock);
+}
+
+/*
+ * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * This will be consumed by consumt_stock() function, later.
+ */
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+ struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+ if (stock->cached != mem) { /* reset if necessary */
+ drain_stock(stock);
+ stock->cached = mem;
+ }
+ stock->charge += val;
+ put_cpu_var(memcg_stock);
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just put a work per cpu for draining localy on each cpu. Caller can
+ * expects some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(void)
+{
+ int cpu;
+ /* This function is for scheduling "drain" in asynchronous way.
+ * The result of "drain" is not directly handled by callers. Then,
+ * if someone is calling drain, we don't have to call drain more.
+ * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
+ * there is a race. We just do loose check here.
+ */
+ if (atomic_read(&memcg_drain_count))
+ return;
+ /* Notify other cpus that system-wide "drain" is running */
+ atomic_inc(&memcg_drain_count);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+ schedule_work_on(cpu, &stock->work);
+ }
+ put_online_cpus();
+ atomic_dec(&memcg_drain_count);
+ /* We don't wait for flush_work */
+}
+
+/* This is a synchronous drain interface. */
+static void drain_all_stock_sync(void)
+{
+ /* called when force_empty is called */
+ atomic_inc(&memcg_drain_count);
+ schedule_on_each_cpu(drain_local_stock);
+ atomic_dec(&memcg_drain_count);
+}
+
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ struct memcg_stock_pcp *stock;
+
+ if (action != CPU_DEAD)
+ return NOTIFY_OK;
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
+ return NOTIFY_OK;
+}
+
+/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcg,
- bool oom)
+ bool oom, struct page *page)
{
struct mem_cgroup *mem, *mem_over_limit;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct res_counter *fail_res;
+ int csize = CHARGE_SIZE;
if (unlikely(test_thread_flag(TIF_MEMDIE))) {
/* Don't account this! */
@@ -994,22 +1427,26 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
return 0;
VM_BUG_ON(css_is_removed(&mem->css));
+ if (mem_cgroup_is_root(mem))
+ goto done;
while (1) {
- int ret;
- bool noswap = false;
+ int ret = 0;
+ unsigned long flags = 0;
+
+ if (consume_stock(mem))
+ goto charged;
- ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
break;
- ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
- &fail_res);
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- noswap = true;
+ res_counter_uncharge(&mem->res, csize);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
} else
@@ -1017,11 +1454,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
res);
+ /* reduce request size and retry */
+ if (csize > PAGE_SIZE) {
+ csize = PAGE_SIZE;
+ continue;
+ }
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
- noswap, false);
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
if (ret)
continue;
@@ -1038,20 +1480,42 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (!nr_retries--) {
if (oom) {
- mutex_lock(&memcg_tasklist);
mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- mutex_unlock(&memcg_tasklist);
record_last_oom(mem_over_limit);
}
goto nomem;
}
}
+ if (csize > PAGE_SIZE)
+ refill_stock(mem, csize - PAGE_SIZE);
+charged:
+ /*
+ * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+ * if they exceeds softlimit.
+ */
+ if (mem_cgroup_soft_limit_check(mem))
+ mem_cgroup_update_tree(mem, page);
+done:
return 0;
nomem:
css_put(&mem->css);
return -ENOMEM;
}
+/*
+ * Somemtimes we have to undo a charge we got by try_charge().
+ * This function is for that and do uncharge, put css's refcnt.
+ * gotten by try_charge().
+ */
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ }
+ css_put(&mem->css);
+}
/*
* A helper function to get mem_cgroup from ID. must be called under
@@ -1119,15 +1583,32 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
- css_put(&mem->css);
+ mem_cgroup_cancel_charge(mem);
return;
}
+
pc->mem_cgroup = mem;
+ /*
+ * We access a page_cgroup asynchronously without lock_page_cgroup().
+ * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
+ * is accessed after testing USED bit. To make pc->mem_cgroup visible
+ * before USED bit, we need memory barrier here.
+ * See mem_cgroup_add_lru_list(), etc.
+ */
smp_wmb();
- pc->flags = pcg_default_flags[ctype];
+ switch (ctype) {
+ case MEM_CGROUP_CHARGE_TYPE_CACHE:
+ case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+ SetPageCgroupCache(pc);
+ SetPageCgroupUsed(pc);
+ break;
+ case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ ClearPageCgroupCache(pc);
+ SetPageCgroupUsed(pc);
+ break;
+ default:
+ break;
+ }
mem_cgroup_charge_statistics(mem, pc, true);
@@ -1135,27 +1616,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
}
/**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
*
* This function does "uncharge" from old cgroup but doesn't do "charge" to
* new cgroup. It should be done by a caller.
*/
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to)
{
- struct mem_cgroup_per_zone *from_mz, *to_mz;
- int nid, zid;
- int ret = -EBUSY;
struct page *page;
int cpu;
struct mem_cgroup_stat *stat;
@@ -1163,56 +1639,59 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
+ VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!PageCgroupUsed(pc));
+ VM_BUG_ON(pc->mem_cgroup != from);
- nid = page_cgroup_nid(pc);
- zid = page_cgroup_zid(pc);
- from_mz = mem_cgroup_zoneinfo(from, nid, zid);
- to_mz = mem_cgroup_zoneinfo(to, nid, zid);
-
- if (!trylock_page_cgroup(pc))
- return ret;
-
- if (!PageCgroupUsed(pc))
- goto out;
-
- if (pc->mem_cgroup != from)
- goto out;
-
- res_counter_uncharge(&from->res, PAGE_SIZE);
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->res, PAGE_SIZE);
mem_cgroup_charge_statistics(from, pc, false);
page = pc->page;
- if (page_is_file_cache(page) && page_mapped(page)) {
+ if (page_mapped(page) && !PageAnon(page)) {
cpu = smp_processor_id();
/* Update mapped_file data for mem_cgroup "from" */
stat = &from->stat;
cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-1);
/* Update mapped_file data for mem_cgroup "to" */
stat = &to->stat;
cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1);
}
- if (do_swap_account)
+ if (do_swap_account && !mem_cgroup_is_root(from))
res_counter_uncharge(&from->memsw, PAGE_SIZE);
css_put(&from->css);
css_get(&to->css);
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
- ret = 0;
-out:
- unlock_page_cgroup(pc);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* this function is just force_empty() and it's garanteed that
* "to" is never removed. So, we don't check rmdir status here.
*/
+}
+
+/*
+ * check whether the @pc is valid for moving account and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ int ret = -EINVAL;
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+ __mem_cgroup_move_account(pc, from, to);
+ ret = 0;
+ }
+ unlock_page_cgroup(pc);
return ret;
}
@@ -1234,43 +1713,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (!pcg)
return -EINVAL;
+ ret = -EBUSY;
+ if (!get_page_unless_zero(page))
+ goto out;
+ if (isolate_lru_page(page))
+ goto put;
parent = mem_cgroup_from_cont(pcg);
-
-
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
if (ret || !parent)
- return ret;
-
- if (!get_page_unless_zero(page)) {
- ret = -EBUSY;
- goto uncharge;
- }
-
- ret = isolate_lru_page(page);
-
- if (ret)
- goto cancel;
+ goto put_back;
ret = mem_cgroup_move_account(pc, child, parent);
-
+ if (!ret)
+ css_put(&parent->css); /* drop extra refcnt by try_charge() */
+ else
+ mem_cgroup_cancel_charge(parent); /* does css_put */
+put_back:
putback_lru_page(page);
- if (!ret) {
- put_page(page);
- /* drop extra refcnt by try_charge() */
- css_put(&parent->css);
- return 0;
- }
-
-cancel:
+put:
put_page(page);
-uncharge:
- /* drop extra refcnt by try_charge() */
- css_put(&parent->css);
- /* uncharge if move fails */
- res_counter_uncharge(&parent->res, PAGE_SIZE);
- if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+out:
return ret;
}
@@ -1295,7 +1758,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
prefetchw(pc);
mem = memcg;
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
if (ret || !mem)
return ret;
@@ -1388,7 +1851,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is aquired. This refcnt will be cumsumed by
+ * struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
@@ -1405,23 +1868,24 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
goto charge_cur_mm;
/*
* A racing thread's fault, or swapoff, may have already updated
- * the pte, and even removed page from swap cache: return success
- * to go on to do_swap_page()'s pte_same() test, which should fail.
+ * the pte, and even removed page from swap cache: in those cases
+ * do_swap_page()'s pte_same() test will fail; but there's also a
+ * KSM case which does need to charge the page.
*/
if (!PageSwapCache(page))
- return 0;
+ goto charge_cur_mm;
mem = try_get_mem_cgroup_from_swapcache(page);
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
/* drop extra refcnt from tryget */
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
}
static void
@@ -1459,7 +1923,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
* This recorded memcg can be obsolete one. So, avoid
* calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -1484,13 +1950,54 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
+ mem_cgroup_cancel_charge(mem);
+}
+
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+ struct memcg_batch_info *batch = NULL;
+ bool uncharge_memsw = true;
+ /* If swapout, usage of swap doesn't decrease */
+ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+ uncharge_memsw = false;
+ /*
+ * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+ * In those cases, all pages freed continously can be expected to be in
+ * the same cgroup and we have chance to coalesce uncharges.
+ * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+ * because we want to do uncharge as soon as possible.
+ */
+ if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+ goto direct_uncharge;
+
+ batch = &current->memcg_batch;
+ /*
+ * In usual, we do css_get() when we remember memcg pointer.
+ * But in this case, we keep res->usage until end of a series of
+ * uncharges. Then, it's ok to ignore memcg's refcnt.
+ */
+ if (!batch->memcg)
+ batch->memcg = mem;
+ /*
+ * In typical case, batch->memcg == mem. This means we can
+ * merge a series of uncharges to an uncharge of res_counter.
+ * If not, we uncharge res_counter ony by one.
+ */
+ if (batch->memcg != mem)
+ goto direct_uncharge;
+ /* remember freed charge and uncharge it later */
+ batch->bytes += PAGE_SIZE;
+ if (uncharge_memsw)
+ batch->memsw_bytes += PAGE_SIZE;
+ return;
+direct_uncharge:
res_counter_uncharge(&mem->res, PAGE_SIZE);
- if (do_swap_account)
+ if (uncharge_memsw)
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
- css_put(&mem->css);
+ return;
}
-
/*
* uncharge if !page_mapped(page)
*/
@@ -1538,9 +2045,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ if (!mem_cgroup_is_root(mem))
+ __do_uncharge(mem, ctype);
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+ mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -1554,6 +2062,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
+ if (mem_cgroup_soft_limit_check(mem))
+ mem_cgroup_update_tree(mem, page);
/* at swapout, this memcg will be accessed to record to swap */
if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
css_put(&mem->css);
@@ -1582,6 +2092,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
+/*
+ * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
+ * In that cases, pages are freed continuously and we can expect pages
+ * are in the same memcg. All these calls itself limits the number of
+ * pages freed at once, then uncharge_start/end() is called properly.
+ * This may be called prural(2) times in a context,
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+ current->memcg_batch.do_batch++;
+ /* We can do nest. */
+ if (current->memcg_batch.do_batch == 1) {
+ current->memcg_batch.memcg = NULL;
+ current->memcg_batch.bytes = 0;
+ current->memcg_batch.memsw_bytes = 0;
+ }
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+ struct memcg_batch_info *batch = &current->memcg_batch;
+
+ if (!batch->do_batch)
+ return;
+
+ batch->do_batch--;
+ if (batch->do_batch) /* If stacked, do nothing. */
+ return;
+
+ if (!batch->memcg)
+ return;
+ /*
+ * This "batch->memcg" is valid without any css_get/put etc...
+ * bacause we hide charges behind us.
+ */
+ if (batch->bytes)
+ res_counter_uncharge(&batch->memcg->res, batch->bytes);
+ if (batch->memsw_bytes)
+ res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+ /* forget this pointer (for sanity check) */
+ batch->memcg = NULL;
+}
+
#ifdef CONFIG_SWAP
/*
* called after __delete_from_swap_cache() and drop "page" account.
@@ -1629,7 +2183,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -1658,7 +2214,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
unlock_page_cgroup(pc);
if (mem) {
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ page);
css_put(&mem->css);
}
*ptr = mem;
@@ -1754,7 +2311,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- int progress;
u64 memswlimit;
int ret = 0;
int children = mem_cgroup_count_children(memcg);
@@ -1798,8 +2354,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
- false, true);
+ mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -1851,7 +2407,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
+ mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_NOSWAP |
+ MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -1862,6 +2420,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret;
}
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+ gfp_t gfp_mask, int nid,
+ int zid)
+{
+ unsigned long nr_reclaimed = 0;
+ struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+ unsigned long reclaimed;
+ int loop = 0;
+ struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
+
+ if (order > 0)
+ return 0;
+
+ mctz = soft_limit_tree_node_zone(nid, zid);
+ /*
+ * This loop can run a while, specially if mem_cgroup's continuously
+ * keep exceeding their soft limit and putting the system under
+ * pressure
+ */
+ do {
+ if (next_mz)
+ mz = next_mz;
+ else
+ mz = mem_cgroup_largest_soft_limit_node(mctz);
+ if (!mz)
+ break;
+
+ reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+ gfp_mask,
+ MEM_CGROUP_RECLAIM_SOFT);
+ nr_reclaimed += reclaimed;
+ spin_lock(&mctz->lock);
+
+ /*
+ * If we failed to reclaim anything from this memory cgroup
+ * it is time to move on to the next cgroup
+ */
+ next_mz = NULL;
+ if (!reclaimed) {
+ do {
+ /*
+ * Loop until we find yet another one.
+ *
+ * By the time we get the soft_limit lock
+ * again, someone might have aded the
+ * group back on the RB tree. Iterate to
+ * make sure we get a different mem.
+ * mem_cgroup_largest_soft_limit_node returns
+ * NULL if no other cgroup is present on
+ * the tree
+ */
+ next_mz =
+ __mem_cgroup_largest_soft_limit_node(mctz);
+ if (next_mz == mz) {
+ css_put(&next_mz->mem->css);
+ next_mz = NULL;
+ } else /* next_mz == NULL or other memcg */
+ break;
+ } while (1);
+ }
+ __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->mem->res);
+ /*
+ * One school of thought says that we should not add
+ * back the node to the tree if reclaim returns 0.
+ * But our reclaim could return 0, simply because due
+ * to priority we are exposing a smaller subset of
+ * memory to reclaim from. Consider this as a longer
+ * term TODO.
+ */
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ css_put(&mz->mem->css);
+ loop++;
+ /*
+ * Could not reclaim anything and there are no more
+ * mem cgroups to try or we seem to be looping without
+ * reclaiming anything.
+ */
+ if (!nr_reclaimed &&
+ (next_mz == NULL ||
+ loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+ break;
+ } while (!nr_reclaimed);
+ if (next_mz)
+ css_put(&next_mz->mem->css);
+ return nr_reclaimed;
+}
+
/*
* This routine traverse page_cgroup in given list and drop them all.
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -1944,6 +2593,7 @@ move_account:
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
+ drain_all_stock_sync();
ret = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2026,7 +2676,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
cgroup_lock();
/*
- * If parent's use_hiearchy is set, we can't make any modifications
+ * If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees. If it is unset, then the change can
* occur, provided the current cgroup has no children.
*
@@ -2046,20 +2696,65 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
return retval;
}
+struct mem_cgroup_idx_data {
+ s64 val;
+ enum mem_cgroup_stat_index idx;
+};
+
+static int
+mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+{
+ struct mem_cgroup_idx_data *d = data;
+ d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+ return 0;
+}
+
+static void
+mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx, s64 *val)
+{
+ struct mem_cgroup_idx_data d;
+ d.idx = idx;
+ d.val = 0;
+ mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
+ *val = d.val;
+}
+
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- u64 val = 0;
+ u64 idx_val, val;
int type, name;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
- val = res_counter_read_u64(&mem->res, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_CACHE, &idx_val);
+ val = idx_val;
+ mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_RSS, &idx_val);
+ val += idx_val;
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->res, name);
break;
case _MEMSWAP:
- val = res_counter_read_u64(&mem->memsw, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_CACHE, &idx_val);
+ val = idx_val;
+ mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_RSS, &idx_val);
+ val += idx_val;
+ mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+ val += idx_val;
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->memsw, name);
break;
default:
BUG();
@@ -2083,6 +2778,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
name = MEMFILE_ATTR(cft->private);
switch (name) {
case RES_LIMIT:
+ if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+ ret = -EINVAL;
+ break;
+ }
/* This function does all necessary parse...reuse it */
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
@@ -2092,6 +2791,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
else
ret = mem_cgroup_resize_memsw_limit(memcg, val);
break;
+ case RES_SOFT_LIMIT:
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (ret)
+ break;
+ /*
+ * For memsw, soft limits are hard to implement in terms
+ * of semantics, for now, we support soft limits for
+ * control without swap
+ */
+ if (type == _MEM)
+ ret = res_counter_set_soft_limit(&memcg->res, val);
+ else
+ ret = -EINVAL;
+ break;
default:
ret = -EINVAL; /* should be BUG() ? */
break;
@@ -2149,6 +2862,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
res_counter_reset_failcnt(&mem->memsw);
break;
}
+
return 0;
}
@@ -2157,9 +2871,10 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
enum {
MCS_CACHE,
MCS_RSS,
- MCS_MAPPED_FILE,
+ MCS_FILE_MAPPED,
MCS_PGPGIN,
MCS_PGPGOUT,
+ MCS_SWAP,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -2181,6 +2896,7 @@ struct {
{"mapped_file", "total_mapped_file"},
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
+ {"swap", "total_swap"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -2199,12 +2915,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
s->stat[MCS_CACHE] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
- s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+ val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+ s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
s->stat[MCS_PGPGIN] += val;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
s->stat[MCS_PGPGOUT] += val;
+ if (do_swap_account) {
+ val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+ s->stat[MCS_SWAP] += val * PAGE_SIZE;
+ }
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2956,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_local_stat(mem_cont, &mystat);
- for (i = 0; i < NR_MCS_STAT; i++)
+ for (i = 0; i < NR_MCS_STAT; i++) {
+ if (i == MCS_SWAP && !do_swap_account)
+ continue;
cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+ }
/* Hierarchical information */
{
@@ -2250,9 +2973,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_total_stat(mem_cont, &mystat);
- for (i = 0; i < NR_MCS_STAT; i++)
+ for (i = 0; i < NR_MCS_STAT; i++) {
+ if (i == MCS_SWAP && !do_swap_account)
+ continue;
cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
-
+ }
#ifdef CONFIG_DEBUG_VM
cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +3070,12 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_read,
},
{
+ .name = "soft_limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
@@ -2438,6 +3169,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
mz = &pn->zoneinfo[zone];
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
+ mz->usage_in_excess = 0;
+ mz->on_tree = false;
+ mz->mem = mem;
}
return 0;
}
@@ -2483,6 +3217,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
{
int node;
+ mem_cgroup_remove_from_trees(mem);
free_css_id(&mem_cgroup_subsys, &mem->css);
for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3266,31 @@ static void __init enable_swap_cgroup(void)
}
#endif
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+ struct mem_cgroup_tree_per_node *rtpn;
+ struct mem_cgroup_tree_per_zone *rtpz;
+ int tmp, node, zone;
+
+ for_each_node_state(node, N_POSSIBLE) {
+ tmp = node;
+ if (!node_state(node, N_NORMAL_MEMORY))
+ tmp = -1;
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+ if (!rtpn)
+ return 1;
+
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ rtpz = &rtpn->rb_tree_per_zone[zone];
+ rtpz->rb_root = RB_ROOT;
+ spin_lock_init(&rtpz->lock);
+ }
+ }
+ return 0;
+}
+
static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
@@ -2545,10 +3305,22 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
+
/* root ? */
if (cont->parent == NULL) {
+ int cpu;
enable_swap_cgroup();
parent = NULL;
+ root_mem_cgroup = mem;
+ if (mem_cgroup_soft_limit_tree_init())
+ goto free_out;
+ for_each_possible_cpu(cpu) {
+ struct memcg_stock_pcp *stock =
+ &per_cpu(memcg_stock, cpu);
+ INIT_WORK(&stock->work, drain_local_stock);
+ }
+ hotcpu_notifier(memcg_stock_cpu_callback, 0);
+
} else {
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3349,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
return &mem->css;
free_out:
__mem_cgroup_free(mem);
+ root_mem_cgroup = NULL;
return ERR_PTR(error);
}
@@ -2612,14 +3385,13 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p)
+ struct task_struct *p,
+ bool threadgroup)
{
- mutex_lock(&memcg_tasklist);
/*
* FIXME: It's better to move charges of this process from old
* memcg to new memcg. But it's just on TODO-List now.
*/
- mutex_unlock(&memcg_tasklist);
}
struct cgroup_subsys mem_cgroup_subsys = {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..50d4f8d7024a
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Authors: Andi Kleen, Fengguang Wu
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ *
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * failure.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronous to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * The operation to map back from RMAP chains to processes has to walk
+ * the complete process list and has non linear complexity with the number
+ * mappings. In short it can be quite slow. But since memory corruptions
+ * are rare we hope to get away with this.
+ */
+
+/*
+ * Notebook:
+ * - hugetlb needs more code
+ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
+ * - pass bad pages to kdump next kernel
+ */
+#define DEBUG 1 /* remove me in 2.6.34 */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/sched.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/backing-dev.h>
+#include "internal.h"
+
+int sysctl_memory_failure_early_kill __read_mostly = 0;
+
+int sysctl_memory_failure_recovery __read_mostly = 1;
+
+atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+/*
+ * Send all the processes who have the page mapped an ``action optional''
+ * signal.
+ */
+static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
+ unsigned long pfn)
+{
+ struct siginfo si;
+ int ret;
+
+ printk(KERN_ERR
+ "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
+ si.si_signo = SIGBUS;
+ si.si_errno = 0;
+ si.si_code = BUS_MCEERR_AO;
+ si.si_addr = (void *)addr;
+#ifdef __ARCH_SI_TRAPNO
+ si.si_trapno = trapno;
+#endif
+ si.si_addr_lsb = PAGE_SHIFT;
+ /*
+ * Don't use force here, it's convenient if the signal
+ * can be temporarily blocked.
+ * This could cause a loop when the user sets SIGBUS
+ * to SIG_IGN, but hopefully noone will do that?
+ */
+ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ if (ret < 0)
+ printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
+ t->comm, t->pid, ret);
+ return ret;
+}
+
+/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+ struct list_head nd;
+ struct task_struct *tsk;
+ unsigned long addr;
+ unsigned addr_valid:1;
+};
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do. We just print a message and ignore otherwise.
+ */
+
+/*
+ * Schedule a process for later kill.
+ * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ * TBD would GFP_NOIO be enough?
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill,
+ struct to_kill **tkc)
+{
+ struct to_kill *tk;
+
+ if (*tkc) {
+ tk = *tkc;
+ *tkc = NULL;
+ } else {
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+ if (!tk) {
+ printk(KERN_ERR
+ "MCE: Out of memory while machine check handling\n");
+ return;
+ }
+ }
+ tk->addr = page_address_in_vma(p, vma);
+ tk->addr_valid = 1;
+
+ /*
+ * In theory we don't have to kill when the page was
+ * munmaped. But it could be also a mremap. Since that's
+ * likely very rare kill anyways just out of paranoia, but use
+ * a SIGKILL because the error is not contained anymore.
+ */
+ if (tk->addr == -EFAULT) {
+ pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ page_to_pfn(p), tsk->comm);
+ tk->addr_valid = 0;
+ }
+ get_task_struct(tsk);
+ tk->tsk = tsk;
+ list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Only do anything when DOIT is set, otherwise just free the list
+ * (this is used for clean pages which do not need killing)
+ * Also when FAIL is set do a force kill because something went
+ * wrong earlier.
+ */
+static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
+ int fail, unsigned long pfn)
+{
+ struct to_kill *tk, *next;
+
+ list_for_each_entry_safe (tk, next, to_kill, nd) {
+ if (doit) {
+ /*
+ * In case something went wrong with munmapping
+ * make sure the process doesn't catch the
+ * signal and then access the memory. Just kill it.
+ * the signal handlers
+ */
+ if (fail || tk->addr_valid == 0) {
+ printk(KERN_ERR
+ "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
+ force_sig(SIGKILL, tk->tsk);
+ }
+
+ /*
+ * In theory the process could have mapped
+ * something else on the address in-between. We could
+ * check for that, but we need to tell the
+ * process anyways.
+ */
+ else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
+ pfn) < 0)
+ printk(KERN_ERR
+ "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
+ }
+ put_task_struct(tk->tsk);
+ kfree(tk);
+ }
+}
+
+static int task_early_kill(struct task_struct *tsk)
+{
+ if (!tsk->mm)
+ return 0;
+ if (tsk->flags & PF_MCE_PROCESS)
+ return !!(tsk->flags & PF_MCE_EARLY);
+ return sysctl_memory_failure_early_kill;
+}
+
+/*
+ * Collect processes when the error hit an anonymous page.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+ struct to_kill **tkc)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct anon_vma *av;
+
+ read_lock(&tasklist_lock);
+ av = page_lock_anon_vma(page);
+ if (av == NULL) /* Not actually mapped anymore */
+ goto out;
+ for_each_process (tsk) {
+ if (!task_early_kill(tsk))
+ continue;
+ list_for_each_entry (vma, &av->head, anon_vma_node) {
+ if (!page_mapped_in_vma(page, vma))
+ continue;
+ if (vma->vm_mm == tsk->mm)
+ add_to_kill(tsk, page, vma, to_kill, tkc);
+ }
+ }
+ page_unlock_anon_vma(av);
+out:
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+ struct to_kill **tkc)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct prio_tree_iter iter;
+ struct address_space *mapping = page->mapping;
+
+ /*
+ * A note on the locking order between the two locks.
+ * We don't rely on this particular order.
+ * If you have some other code that needs a different order
+ * feel free to switch them around. Or add a reverse link
+ * from mm_struct to task_struct, then this could be all
+ * done without taking tasklist_lock and looping over all tasks.
+ */
+
+ read_lock(&tasklist_lock);
+ spin_lock(&mapping->i_mmap_lock);
+ for_each_process(tsk) {
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+ if (!task_early_kill(tsk))
+ continue;
+
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+ pgoff) {
+ /*
+ * Send early kill signal to tasks where a vma covers
+ * the page but the corrupted page is not necessarily
+ * mapped it in its pte.
+ * Assume applications who requested early kill want
+ * to be informed of all such data corruptions.
+ */
+ if (vma->vm_mm == tsk->mm)
+ add_to_kill(tsk, page, vma, to_kill, tkc);
+ }
+ }
+ spin_unlock(&mapping->i_mmap_lock);
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect the processes who have the corrupted page mapped to kill.
+ * This is done in two steps for locking reasons.
+ * First preallocate one tokill structure outside the spin locks,
+ * so that we can kill at least one process reasonably reliable.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill)
+{
+ struct to_kill *tk;
+
+ if (!page->mapping)
+ return;
+
+ tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
+ if (!tk)
+ return;
+ if (PageAnon(page))
+ collect_procs_anon(page, tokill, &tk);
+ else
+ collect_procs_file(page, tokill, &tk);
+ kfree(tk);
+}
+
+/*
+ * Error handlers for various types of pages.
+ */
+
+enum outcome {
+ FAILED, /* Error handling failed */
+ DELAYED, /* Will be handled later */
+ IGNORED, /* Error safely ignored */
+ RECOVERED, /* Successfully recovered */
+};
+
+static const char *action_name[] = {
+ [FAILED] = "Failed",
+ [DELAYED] = "Delayed",
+ [IGNORED] = "Ignored",
+ [RECOVERED] = "Recovered",
+};
+
+/*
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
+ */
+static int me_kernel(struct page *p, unsigned long pfn)
+{
+ return DELAYED;
+}
+
+/*
+ * Already poisoned page.
+ */
+static int me_ignore(struct page *p, unsigned long pfn)
+{
+ return IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing.
+ */
+static int me_unknown(struct page *p, unsigned long pfn)
+{
+ printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+ return FAILED;
+}
+
+/*
+ * Free memory
+ */
+static int me_free(struct page *p, unsigned long pfn)
+{
+ return DELAYED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ */
+static int me_pagecache_clean(struct page *p, unsigned long pfn)
+{
+ int err;
+ int ret = FAILED;
+ struct address_space *mapping;
+
+ /*
+ * For anonymous pages we're done the only reference left
+ * should be the one m_f() holds.
+ */
+ if (PageAnon(p))
+ return RECOVERED;
+
+ /*
+ * Now truncate the page in the page cache. This is really
+ * more like a "temporary hole punch"
+ * Don't do this for block devices when someone else
+ * has a reference, because it could be file system metadata
+ * and that's not safe to truncate.
+ */
+ mapping = page_mapping(p);
+ if (!mapping) {
+ /*
+ * Page has been teared down in the meanwhile
+ */
+ return FAILED;
+ }
+
+ /*
+ * Truncation is a bit tricky. Enable it per file system for now.
+ *
+ * Open: to take i_mutex or not for this? Right now we don't.
+ */
+ if (mapping->a_ops->error_remove_page) {
+ err = mapping->a_ops->error_remove_page(mapping, p);
+ if (err != 0) {
+ printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
+ pfn, err);
+ } else if (page_has_private(p) &&
+ !try_to_release_page(p, GFP_NOIO)) {
+ pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ } else {
+ ret = RECOVERED;
+ }
+ } else {
+ /*
+ * If the file system doesn't support it just invalidate
+ * This fails on dirty or anything with private pages
+ */
+ if (invalidate_inode_page(p))
+ ret = RECOVERED;
+ else
+ printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
+ pfn);
+ }
+ return ret;
+}
+
+/*
+ * Dirty cache page page
+ * Issues: when the error hit a hole page the error is not properly
+ * propagated.
+ */
+static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+{
+ struct address_space *mapping = page_mapping(p);
+
+ SetPageError(p);
+ /* TBD: print more information about the file. */
+ if (mapping) {
+ /*
+ * IO error will be reported by write(), fsync(), etc.
+ * who check the mapping.
+ * This way the application knows that something went
+ * wrong with its dirty file data.
+ *
+ * There's one open issue:
+ *
+ * The EIO will be only reported on the next IO
+ * operation and then cleared through the IO map.
+ * Normally Linux has two mechanisms to pass IO error
+ * first through the AS_EIO flag in the address space
+ * and then through the PageError flag in the page.
+ * Since we drop pages on memory failure handling the
+ * only mechanism open to use is through AS_AIO.
+ *
+ * This has the disadvantage that it gets cleared on
+ * the first operation that returns an error, while
+ * the PageError bit is more sticky and only cleared
+ * when the page is reread or dropped. If an
+ * application assumes it will always get error on
+ * fsync, but does other operations on the fd before
+ * and the page is dropped inbetween then the error
+ * will not be properly reported.
+ *
+ * This can already happen even without hwpoisoned
+ * pages: first on metadata IO errors (which only
+ * report through AS_EIO) or when the page is dropped
+ * at the wrong time.
+ *
+ * So right now we assume that the application DTRT on
+ * the first EIO, but we're not worse than other parts
+ * of the kernel.
+ */
+ mapping_set_error(mapping, EIO);
+ }
+
+ return me_pagecache_clean(p, pfn);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache(ie. page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ * - clear dirty bit to prevent IO
+ * - remove from LRU
+ * - but keep in the swap cache, so that when we return to it on
+ * a later page fault, we know the application is accessing
+ * corrupted data and shall be killed (we installed simple
+ * interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated. A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+{
+ ClearPageDirty(p);
+ /* Trigger EIO in shmem: */
+ ClearPageUptodate(p);
+
+ return DELAYED;
+}
+
+static int me_swapcache_clean(struct page *p, unsigned long pfn)
+{
+ delete_from_swap_cache(p);
+
+ return RECOVERED;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * No rmap support so we cannot find the original mapper. In theory could walk
+ * all MMs and look for the mappings, but that would be non atomic and racy.
+ * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
+ * like just walking the current process and hoping it has it mapped (that
+ * should be usually true for the common "shared database cache" case)
+ * Should handle free huge pages and dequeue them too, but this needs to
+ * handle huge page accounting correctly.
+ */
+static int me_huge_page(struct page *p, unsigned long pfn)
+{
+ return FAILED;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access page at any time
+ * in its live cycle, so all accesses have to be extremly careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+
+#define dirty (1UL << PG_dirty)
+#define sc (1UL << PG_swapcache)
+#define unevict (1UL << PG_unevictable)
+#define mlock (1UL << PG_mlocked)
+#define writeback (1UL << PG_writeback)
+#define lru (1UL << PG_lru)
+#define swapbacked (1UL << PG_swapbacked)
+#define head (1UL << PG_head)
+#define tail (1UL << PG_tail)
+#define compound (1UL << PG_compound)
+#define slab (1UL << PG_slab)
+#define buddy (1UL << PG_buddy)
+#define reserved (1UL << PG_reserved)
+
+static struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ char *msg;
+ int (*action)(struct page *p, unsigned long pfn);
+} error_states[] = {
+ { reserved, reserved, "reserved kernel", me_ignore },
+ { buddy, buddy, "free kernel", me_free },
+
+ /*
+ * Could in theory check if slab page is free or if we can drop
+ * currently unused objects without touching them. But just
+ * treat it as standard kernel for now.
+ */
+ { slab, slab, "kernel slab", me_kernel },
+
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+ { head, head, "huge", me_huge_page },
+ { tail, tail, "huge", me_huge_page },
+#else
+ { compound, compound, "huge", me_huge_page },
+#endif
+
+ { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
+ { sc|dirty, sc, "swapcache", me_swapcache_clean },
+
+ { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
+ { unevict, unevict, "unevictable LRU", me_pagecache_clean},
+
+ { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
+ { mlock, mlock, "mlocked LRU", me_pagecache_clean },
+
+ { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
+ { lru|dirty, lru, "clean LRU", me_pagecache_clean },
+ { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
+
+ /*
+ * Catchall entry: must be at end.
+ */
+ { 0, 0, "unknown page state", me_unknown },
+};
+
+static void action_result(unsigned long pfn, char *msg, int result)
+{
+ struct page *page = NULL;
+ if (pfn_valid(pfn))
+ page = pfn_to_page(pfn);
+
+ printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
+ pfn,
+ page && PageDirty(page) ? "dirty " : "",
+ msg, action_name[result]);
+}
+
+static int page_action(struct page_state *ps, struct page *p,
+ unsigned long pfn, int ref)
+{
+ int result;
+ int count;
+
+ result = ps->action(p, pfn);
+ action_result(pfn, ps->msg, result);
+
+ count = page_count(p) - 1 - ref;
+ if (count != 0)
+ printk(KERN_ERR
+ "MCE %#lx: %s page still referenced by %d users\n",
+ pfn, ps->msg, count);
+
+ /* Could do more checks here if page looks ok */
+ /*
+ * Could adjust zone counters here to correct for the missing page.
+ */
+
+ return result == RECOVERED ? 0 : -EBUSY;
+}
+
+#define N_UNMAP_TRIES 5
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ */
+static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
+ int trapno)
+{
+ enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ struct address_space *mapping;
+ LIST_HEAD(tokill);
+ int ret;
+ int i;
+ int kill = 1;
+
+ if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
+ return;
+
+ /*
+ * This check implies we don't kill processes if their pages
+ * are in the swap cache early. Those are always late kills.
+ */
+ if (!page_mapped(p))
+ return;
+
+ if (PageSwapCache(p)) {
+ printk(KERN_ERR
+ "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu |= TTU_IGNORE_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ */
+ mapping = page_mapping(p);
+ if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(p)) {
+ SetPageDirty(p);
+ } else {
+ kill = 0;
+ ttu |= TTU_IGNORE_HWPOISON;
+ printk(KERN_INFO
+ "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
+ /*
+ * First collect all the processes that have the page
+ * mapped in dirty form. This has to be done before try_to_unmap,
+ * because ttu takes the rmap data structures down.
+ *
+ * Error handling: We ignore errors here because
+ * there's nothing that can be done.
+ */
+ if (kill)
+ collect_procs(p, &tokill);
+
+ /*
+ * try_to_unmap can fail temporarily due to races.
+ * Try a few times (RED-PEN better strategy?)
+ */
+ for (i = 0; i < N_UNMAP_TRIES; i++) {
+ ret = try_to_unmap(p, ttu);
+ if (ret == SWAP_SUCCESS)
+ break;
+ pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
+ }
+
+ if (ret != SWAP_SUCCESS)
+ printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(p));
+
+ /*
+ * Now that the dirty bit has been propagated to the
+ * struct page and all unmaps done we can decide if
+ * killing is needed or not. Only kill when the page
+ * was dirty, otherwise the tokill list is merely
+ * freed. When there was a problem unmapping earlier
+ * use a more force-full uncatchable kill to prevent
+ * any accesses to the poisoned memory.
+ */
+ kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+ ret != SWAP_SUCCESS, pfn);
+}
+
+int __memory_failure(unsigned long pfn, int trapno, int ref)
+{
+ unsigned long lru_flag;
+ struct page_state *ps;
+ struct page *p;
+ int res;
+
+ if (!sysctl_memory_failure_recovery)
+ panic("Memory failure from trap %d on page %lx", trapno, pfn);
+
+ if (!pfn_valid(pfn)) {
+ action_result(pfn, "memory outside kernel control", IGNORED);
+ return -EIO;
+ }
+
+ p = pfn_to_page(pfn);
+ if (TestSetPageHWPoison(p)) {
+ action_result(pfn, "already hardware poisoned", IGNORED);
+ return 0;
+ }
+
+ atomic_long_add(1, &mce_bad_pages);
+
+ /*
+ * We need/can do nothing about count=0 pages.
+ * 1) it's a free page, and therefore in safe hand:
+ * prep_new_page() will be the gate keeper.
+ * 2) it's part of a non-compound high order page.
+ * Implies some kernel user: cannot stop them from
+ * R/W the page; let's pray that the page has been
+ * used and will be freed some time later.
+ * In fact it's dangerous to directly bump up page count from 0,
+ * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+ */
+ if (!get_page_unless_zero(compound_head(p))) {
+ action_result(pfn, "free or high order kernel", IGNORED);
+ return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
+ }
+
+ /*
+ * We ignore non-LRU pages for good reasons.
+ * - PG_locked is only well defined for LRU pages and a few others
+ * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+ * The check (unnecessarily) ignores LRU pages being isolated and
+ * walked by the page reclaim code, however that's not a big loss.
+ */
+ if (!PageLRU(p))
+ lru_add_drain_all();
+ lru_flag = p->flags & lru;
+ if (isolate_lru_page(p)) {
+ action_result(pfn, "non LRU", IGNORED);
+ put_page(p);
+ return -EBUSY;
+ }
+ page_cache_release(p);
+
+ /*
+ * Lock the page and wait for writeback to finish.
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
+ */
+ lock_page_nosync(p);
+ wait_on_page_writeback(p);
+
+ /*
+ * Now take care of user space mappings.
+ */
+ hwpoison_user_mappings(p, pfn, trapno);
+
+ /*
+ * Torn down by someone else?
+ */
+ if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
+ action_result(pfn, "already truncated LRU", IGNORED);
+ res = 0;
+ goto out;
+ }
+
+ res = -EBUSY;
+ for (ps = error_states;; ps++) {
+ if (((p->flags | lru_flag)& ps->mask) == ps->res) {
+ res = page_action(ps, p, pfn, ref);
+ break;
+ }
+ }
+out:
+ unlock_page(p);
+ return res;
+}
+EXPORT_SYMBOL_GPL(__memory_failure);
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+void memory_failure(unsigned long pfn, int trapno)
+{
+ __memory_failure(pfn, trapno, 0);
+}
diff --git a/mm/memory.c b/mm/memory.c
index b1443ac07c00..aed45eaf8ac9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long addr = vma->vm_start;
/*
- * Hide vma from rmap and vmtruncate before freeing pgtables
+ * Hide vma from rmap and truncate_pagecache before freeing
+ * pgtables
*/
anon_vma_unlink(vma);
unlink_file_vma(vma);
@@ -571,7 +572,7 @@ out:
* covered by this vma.
*/
-static inline void
+static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -585,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!pte_file(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);
- swap_duplicate(entry);
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
@@ -634,16 +637,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
+ pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
int rss[2];
+ swp_entry_t entry = (swp_entry_t){0};
again:
rss[1] = rss[0] = 0;
@@ -653,6 +659,8 @@ again:
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ orig_src_pte = src_pte;
+ orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();
do {
@@ -670,16 +678,25 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+ vma, addr, rss);
+ if (entry.val)
+ break;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(src_pte - 1);
+ pte_unmap_nested(orig_src_pte);
add_mm_rss(dst_mm, rss[0], rss[1]);
- pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
+
+ if (entry.val) {
+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ progress = 0;
+ }
if (addr != end)
goto again;
return 0;
@@ -939,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
details = NULL;
BUG_ON(addr >= end);
+ mem_cgroup_uncharge_start();
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -951,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
zap_work, details);
} while (pgd++, addr = next, (addr != end && *zap_work > 0));
tlb_end_vma(tlb, vma);
+ mem_cgroup_uncharge_end();
return addr;
}
@@ -1325,7 +1344,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
- else if (ret & VM_FAULT_SIGBUS)
+ if (ret &
+ (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
return i ? i : -EFAULT;
BUG();
}
@@ -1818,10 +1838,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
token = pmd_pgtable(*pmd);
do {
- err = fn(pte, token, addr, data);
+ err = fn(pte++, token, addr, data);
if (err)
break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -2407,7 +2427,7 @@ restart:
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
- * boundary. Note that this is different from vmtruncate(), which
+ * boundary. Note that this is different from truncate_pagecache(), which
* must keep the partial page. In contrast, we must get rid of
* partial pages.
* @holelen: size of prospective hole in bytes. This will be rounded
@@ -2458,63 +2478,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @offset: file offset to start truncating
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- */
-int vmtruncate(struct inode * inode, loff_t offset)
-{
- if (inode->i_size < offset) {
- unsigned long limit;
-
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out_big;
- i_size_write(inode, offset);
- } else {
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * truncation of in-use swapfiles is disallowed - it would
- * cause subsequent swapout to scribble on the now-freed
- * blocks.
- */
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
- i_size_write(inode, offset);
-
- /*
- * unmap_mapping_range is called twice, first simply for
- * efficiency so that truncate_inode_pages does fewer
- * single-page unmaps. However after this first call, and
- * before truncate_inode_pages finishes, it is possible for
- * private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second
- * unmap_mapping_range call must be made for correctness.
- */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- }
-
- if (inode->i_op->truncate)
- inode->i_op->truncate(inode);
- return 0;
-
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out_big:
- return -EFBIG;
-}
-EXPORT_SYMBOL(vmtruncate);
-
int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
{
struct address_space *mapping = inode->i_mapping;
@@ -2559,8 +2522,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
entry = pte_to_swp_entry(orig_pte);
- if (is_migration_entry(entry)) {
- migration_entry_wait(mm, pmd, address);
+ if (unlikely(non_swap_entry(entry))) {
+ if (is_migration_entry(entry)) {
+ migration_entry_wait(mm, pmd, address);
+ } else if (is_hwpoison_entry(entry)) {
+ ret = VM_FAULT_HWPOISON;
+ } else {
+ print_bad_pte(vma, address, orig_pte, NULL);
+ ret = VM_FAULT_SIGBUS;
+ }
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2584,11 +2554,21 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
+ } else if (PageHWPoison(page)) {
+ ret = VM_FAULT_HWPOISON;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ goto out_release;
}
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ page = ksm_might_need_to_copy(page, vma, address);
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
goto out_page;
@@ -2655,6 +2635,7 @@ out_nomap:
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
+out_release:
page_cache_release(page);
return ret;
}
@@ -2760,6 +2741,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
+ if (unlikely(PageHWPoison(vmf.page))) {
+ if (ret & VM_FAULT_LOCKED)
+ unlock_page(vmf.page);
+ return VM_FAULT_HWPOISON;
+ }
+
/*
* For consistency in subsequent calls, make the faulted page always
* locked.
@@ -2944,7 +2931,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Page table corrupted: show pte and kill process.
*/
print_bad_pte(vma, address, orig_pte, NULL);
- return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
}
pgoff = pte_to_pgoff(orig_pte);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..030ce8a5bb0e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,8 @@
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
@@ -70,7 +72,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
atomic_inc(&page->_count);
}
-void put_page_bootmem(struct page *page)
+/* reference to __meminit __free_pages_bootmem is valid
+ * so use __ref to tell modpost not to generate a warning */
+void __ref put_page_bootmem(struct page *page)
{
int type;
@@ -447,7 +451,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +489,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
+ lock_system_sleep();
+
res = register_memory_resource(start, size);
+ ret = -EEXIST;
if (!res)
- return -EEXIST;
+ goto out;
if (!node_online(nid)) {
pgdat = hotadd_new_pgdat(nid, start);
+ ret = -ENOMEM;
if (!pgdat)
- return -ENOMEM;
+ goto out;
new_pgdat = 1;
}
@@ -514,7 +523,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
BUG_ON(ret);
}
- return ret;
+ goto out;
+
error:
/* rollback pgdat allocation and others */
if (new_pgdat)
@@ -522,6 +532,8 @@ error:
if (res)
release_memory_resource(res);
+out:
+ unlock_system_sleep();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -663,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!ret) { /* Success */
list_add_tail(&page->lru, &source);
move_pages--;
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+
} else {
/* Becasue we don't have big zone->lock. we should
check this again here. */
@@ -685,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (list_empty(&source))
goto out;
/* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
out:
return ret;
@@ -738,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-int offline_pages(unsigned long start_pfn,
+static int offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
unsigned long pfn, nr_pages, expire;
@@ -758,6 +773,8 @@ int offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
+ lock_system_sleep();
+
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
@@ -765,7 +782,7 @@ int offline_pages(unsigned long start_pfn,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn);
if (ret)
- return ret;
+ goto out;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -838,11 +855,16 @@ repeat:
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
+ if (!node_present_pages(node)) {
+ node_clear_state(node, N_HIGH_MEMORY);
+ kswapd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
+ unlock_system_sleep();
return 0;
failed_removal:
@@ -852,6 +874,8 @@ failed_removal:
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn);
+out:
+ unlock_system_sleep();
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..290fb5bf0440 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,10 +85,12 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!page)
continue;
/*
- * The check for PageReserved here is important to avoid
- * handling zero pages and other pages that may have been
- * marked special by the system.
- *
- * If the PageReserved would not be checked here then f.e.
- * the location of the zero page could have an influence
- * on MPOL_MF_STRICT, zero pages would be counted for
- * the per node stats, and there would be useless attempts
- * to put zero pages on the migration list.
+ * vm_normal_page() filters out zero pages, but there might
+ * still be PageReserved pages to skip, perhaps in a VDSO.
+ * And we cannot move PageKsm pages sensibly or safely yet.
*/
- if (PageReserved(page))
+ if (PageReserved(page) || PageKsm(page))
continue;
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (!isolate_lru_page(page)) {
list_add_tail(&page->lru, pagelist);
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
}
}
}
@@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_node_page, dest);
+ err = migrate_pages(&pagelist, new_node_page, dest, 0);
return err;
}
@@ -1024,7 +1022,7 @@ static long do_mbind(unsigned long start, unsigned long len,
err = migrate_prep();
if (err)
- return err;
+ goto mpol_out;
}
{
NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1037,9 @@ static long do_mbind(unsigned long start, unsigned long len,
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
- if (err) {
- mpol_put(new);
- return err;
- }
+ if (err)
+ goto mpol_out;
+
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
@@ -1054,13 +1051,15 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist))
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma);
+ (unsigned long)vma, 0);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
- }
+ } else
+ putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem);
+ mpol_out:
mpol_put(new);
return err;
}
@@ -1564,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
}
return zl;
}
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining it's own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ int nid;
+
+ if (!(mask && current->mempolicy))
+ return false;
+
+ mempolicy = current->mempolicy;
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ if (mempolicy->flags & MPOL_F_LOCAL)
+ nid = numa_node_id();
+ else
+ nid = mempolicy->v.preferred_node;
+ init_nodemask_of_node(mask, nid);
+ break;
+
+ case MPOL_BIND:
+ /* Fall through */
+ case MPOL_INTERLEAVE:
+ *mask = mempolicy->v.nodes;
+ break;
+
+ default:
+ BUG();
+ }
+
+ return true;
+}
#endif
/* Allocate a page in interleaved policy.
diff --git a/mm/migrate.c b/mm/migrate.c
index 16052e80aaac..efddbf0926b2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,6 +21,7 @@
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static void remove_migration_pte(struct vm_area_struct *vma,
- struct page *old, struct page *new)
+static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
+ unsigned long addr, void *old)
{
struct mm_struct *mm = vma->vm_mm;
swp_entry_t entry;
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma,
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
- unsigned long addr = page_address_in_vma(new, vma);
-
- if (addr == -EFAULT)
- return;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
- return;
+ goto out;
pud = pud_offset(pgd, addr);
if (!pud_present(*pud))
- return;
+ goto out;
pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
- return;
+ goto out;
ptep = pte_offset_map(pmd, addr);
if (!is_swap_pte(*ptep)) {
pte_unmap(ptep);
- return;
+ goto out;
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
- goto out;
+ goto unlock;
entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
- goto out;
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != old)
+ goto unlock;
get_page(new);
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
@@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma,
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, pte);
-
-out:
+unlock:
pte_unmap_unlock(ptep, ptl);
-}
-
-/*
- * Note that remove_file_migration_ptes will only work on regular mappings,
- * Nonlinear mappings do not use migration entries.
- */
-static void remove_file_migration_ptes(struct page *old, struct page *new)
-{
- struct vm_area_struct *vma;
- struct address_space *mapping = new->mapping;
- struct prio_tree_iter iter;
- pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
- if (!mapping)
- return;
-
- spin_lock(&mapping->i_mmap_lock);
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
- remove_migration_pte(vma, old, new);
-
- spin_unlock(&mapping->i_mmap_lock);
-}
-
-/*
- * Must hold mmap_sem lock on at least one of the vmas containing
- * the page so that the anon_vma cannot vanish.
- */
-static void remove_anon_migration_ptes(struct page *old, struct page *new)
-{
- struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
- unsigned long mapping;
-
- mapping = (unsigned long)new->mapping;
-
- if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
- return;
-
- /*
- * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
- */
- anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
-
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
- remove_migration_pte(vma, old, new);
-
- spin_unlock(&anon_vma->lock);
+out:
+ return SWAP_AGAIN;
}
/*
@@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
- if (PageAnon(new))
- remove_anon_migration_ptes(old, new);
- else
- remove_file_migration_ptes(old, new);
+ rmap_walk(new, remove_migration_pte, old);
}
/*
@@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
if (TestClearPageActive(page)) {
VM_BUG_ON(PageUnevictable(page));
SetPageActive(newpage);
- } else
- unevictable_migrate_page(newpage, page);
+ } else if (TestClearPageUnevictable(page))
+ SetPageUnevictable(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
@@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
}
mlock_migrate_page(newpage, page);
+ ksm_migrate_page(newpage, page);
ClearPageSwapCache(page);
ClearPagePrivate(page);
@@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page)
else
rc = fallback_migrate_page(mapping, newpage, page);
- if (!rc) {
+ if (!rc)
remove_migration_ptes(page, newpage);
- } else
+ else
newpage->mapping = NULL;
unlock_page(newpage);
@@ -595,14 +543,14 @@ static int move_to_new_page(struct page *newpage, struct page *page)
* to the newly allocated page in newpage.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force)
+ struct page *page, int force, int offlining)
{
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
int rcu_locked = 0;
int charge = 0;
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
if (!newpage)
return -ENOMEM;
@@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
lock_page(page);
}
+ /*
+ * Only memory hotplug's offline_pages() caller has locked out KSM,
+ * and can safely migrate a KSM page. The other cases have skipped
+ * PageKsm along with PageReserved - but it is only now when we have
+ * the page lock that we can be certain it will not go KSM beneath us
+ * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
+ * its pagecount raised, but only here do we take the page lock which
+ * serializes that).
+ */
+ if (PageKsm(page) && !offlining) {
+ rc = -EBUSY;
+ goto unlock;
+ }
+
/* charge against new page */
charge = mem_cgroup_prepare_migration(page, &mem);
if (charge == -ENOMEM) {
@@ -675,7 +637,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
}
/* Establish migration ptes or remove ptes */
- try_to_unmap(page, 1);
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
skip_unmap:
if (!page_mapped(page))
@@ -737,7 +699,7 @@ move_newpage:
* Return: Number of pages not migrated or error code.
*/
int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private)
+ new_page_t get_new_page, unsigned long private, int offlining)
{
int retry = 1;
int nr_failed = 0;
@@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from,
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
int rc;
- unsigned long flags;
-
- local_irq_save(flags);
- list_for_each_entry(page, from, lru)
- __inc_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- local_irq_restore(flags);
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
@@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move(get_new_page, private,
- page, pass > 2);
+ page, pass > 2, offlining);
switch(rc) {
case -ENOMEM:
@@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!page)
goto set_status;
- if (PageReserved(page)) /* Check for zero page */
+ /* Use PageReserved to check for zero page */
+ if (PageReserved(page) || PageKsm(page))
goto put_and_set;
pp->page = page;
@@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
err = isolate_lru_page(page);
- if (!err)
+ if (!err) {
list_add_tail(&page->lru, &pagelist);
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ }
put_and_set:
/*
* Either remove the duplicate refcount from
@@ -894,7 +853,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist))
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm);
+ (unsigned long)pm, 0);
up_read(&mm->mmap_sem);
return err;
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
err = -ENOENT;
/* Use PageReserved to check for zero page */
- if (!page || PageReserved(page))
+ if (!page || PageReserved(page) || PageKsm(page))
goto set_status;
err = page_to_nid(page);
@@ -1044,7 +1003,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
int err;
for (i = 0; i < nr_pages; i += chunk_nr) {
- if (chunk_nr + i > nr_pages)
+ if (chunk_nr > nr_pages - i)
chunk_nr = nr_pages - i;
err = copy_from_user(chunk_pages, &pages[i],
diff --git a/mm/mincore.c b/mm/mincore.c
index 8cb508f84ea4..7a3436ef39eb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,6 +14,7 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/hugetlb.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
if (!vma || addr < vma->vm_start)
return -ENOMEM;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h;
+ unsigned long nr_huge;
+ unsigned char present;
+
+ i = 0;
+ nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
+ h = hstate_vma(vma);
+ nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
+ - (addr >> huge_page_shift(h)) + 1;
+ nr_huge = min(nr_huge,
+ (vma->vm_end - addr) >> huge_page_shift(h));
+ while (1) {
+ /* hugepage always in RAM for now,
+ * but generally it needs to be check */
+ ptep = huge_pte_offset(current->mm,
+ addr & huge_page_mask(h));
+ present = !!(ptep &&
+ !huge_pte_none(huge_ptep_get(ptep)));
+ while (1) {
+ vec[i++] = present;
+ addr += PAGE_SIZE;
+ /* reach buffer limit */
+ if (i == nr)
+ return nr;
+ /* check hugepage border */
+ if (!((addr & ~huge_page_mask(h))
+ >> PAGE_SHIFT))
+ break;
+ }
+ }
+ return nr;
+ }
+#endif
+
/*
* Calculate how many pages there are left in the last level of the
* PTE array for our address.
diff --git a/mm/mlock.c b/mm/mlock.c
index bd6f0e466f6c..2b8335a89400 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page)
}
}
-/*
- * called from munlock()/munmap() path with page supposedly on the LRU.
+/**
+ * munlock_vma_page - munlock a vma page
+ * @page - page to be unlocked
*
- * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
- * [in try_to_munlock()] and then attempt to isolate the page. We must
- * isolate the page to keep others from messing with its unevictable
- * and mlocked state while trying to munlock. However, we pre-clear the
- * mlocked state anyway as we might lose the isolation race and we might
- * not get another chance to clear PageMlocked. If we successfully
- * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
- * mapping the page, it will restore the PageMlocked state, unless the page
- * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
- * perhaps redundantly.
- * If we lose the isolation race, and the page is mapped by other VM_LOCKED
- * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
- * either of which will restore the PageMlocked state by calling
- * mlock_vma_page() above, if it can grab the vma's mmap sem.
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ * When we munlock a page, because the vma where we found the page is being
+ * munlock()ed or munmap()ed, we want to check whether other vmas hold the
+ * page locked so that we can leave it on the unevictable lru list and not
+ * bother vmscan with it. However, to walk the page's rmap list in
+ * try_to_munlock() we must isolate the page from the LRU. If some other
+ * task has removed the page from the LRU, we won't be able to do that.
+ * So we clear the PageMlocked as we might not get another chance. If we
+ * can't isolate the page, we leave it for putback_lru_page() and vmscan
+ * [page_referenced()/try_to_unmap()] to deal with.
*/
-static void munlock_vma_page(struct page *page)
+void munlock_vma_page(struct page *page)
{
BUG_ON(!PageLocked(page));
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page)
/*
* did try_to_unlock() succeed or punt?
*/
- if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+ if (ret != SWAP_MLOCK)
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
putback_lru_page(page);
} else {
/*
- * We lost the race. let try_to_unmap() deal
- * with it. At least we get the page state and
- * mlock stats right. However, page is still on
- * the noreclaim list. We'll fix that up when
- * the page is eventually freed or we scan the
- * noreclaim list.
+ * Some other task has removed the page from the LRU.
+ * putback_lru_page() will take care of removing the
+ * page from the unevictable list, if necessary.
+ * vmscan [page_referenced()] will move the page back
+ * to the unevictable list if some other vma has it
+ * mlocked.
*/
if (PageUnevictable(page))
count_vm_event(UNEVICTABLE_PGSTRANDED);
diff --git a/mm/mmap.c b/mm/mmap.c
index 21d4029a07b3..d9c77b2dbe9d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,7 +20,6 @@
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
-#include <linux/ima.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <linux/module.h>
@@ -932,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
- error = arch_mmap_check(addr, len, flags);
- if (error)
- return error;
-
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
- if (!len || len > TASK_SIZE)
+ if (!len)
return -ENOMEM;
/* offset overflow? */
@@ -949,24 +944,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
- if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
- if (file)
- return -EINVAL;
-
- /*
- * VM_NORESERVE is used because the reservations will be
- * taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
- */
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
- if (IS_ERR(file))
- return PTR_ERR(file);
- }
-
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
@@ -1061,9 +1038,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (error)
return error;
- error = ima_file_mmap(file, prot);
- if (error)
- return error;
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
@@ -1224,8 +1198,20 @@ munmap_back:
goto free_vma;
}
- if (vma_wants_writenotify(vma))
+ if (vma_wants_writenotify(vma)) {
+ pgprot_t pprot = vma->vm_page_prot;
+
+ /* Can vma->vm_page_prot have changed??
+ *
+ * Answer: Yes, drivers may have changed it in their
+ * f_op->mmap method.
+ *
+ * Ensures that vmas marked as uncached stay that way.
+ */
vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+ if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ }
vma_link(mm, vma, prev, rb_link, rb_parent);
file = vma->vm_file;
@@ -1459,6 +1445,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
+ unsigned long error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
+ /* Careful about overflows.. */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
get_area = current->mm->get_unmapped_area;
if (file && file->f_op && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
@@ -1829,10 +1823,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Split a vma into two pieces at address 'addr', a new vma is allocated
- * either for the first part or the tail.
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
+ * munmap path where it doesn't make sense to fail.
*/
-int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
{
struct mempolicy *pol;
@@ -1842,9 +1836,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
- if (mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
-
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -1884,6 +1875,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
return 0;
}
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ if (mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ return __split_vma(mm, vma, addr, new_below);
+}
+
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -1919,7 +1923,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
* places tmp vma above, and higher split_vma places tmp vma below.
*/
if (start > vma->vm_start) {
- int error = split_vma(mm, vma, start, 0);
+ int error;
+
+ /*
+ * Make sure that map_count on return from munmap() will
+ * not exceed its limit; but let map_count go just above
+ * its limit temporarily, to help free resources as expected.
+ */
+ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
@@ -1928,7 +1942,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* Does it split the last one? */
last = find_vma(mm, end);
if (last && end > last->vm_start) {
- int error = split_vma(mm, last, end, 1);
+ int error = __split_vma(mm, last, end, 1);
if (error)
return error;
}
@@ -2003,20 +2017,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (!len)
return addr;
- if ((addr + len) > TASK_SIZE || (addr + len) < addr)
- return -EINVAL;
-
- if (is_hugepage_only_range(mm, addr, len))
- return -EINVAL;
-
error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
if (error)
return error;
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- error = arch_mmap_check(addr, len, flags);
- if (error)
+ error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (error & ~PAGE_MASK)
return error;
/*
@@ -2282,7 +2290,7 @@ static void special_mapping_close(struct vm_area_struct *vma)
{
}
-static struct vm_operations_struct special_mapping_vmops = {
+static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
};
diff --git a/mm/mremap.c b/mm/mremap.c
index 20a07dba6be0..845190898d59 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (vma->vm_file) {
/*
* Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock vmtruncate out,
- * since it might clean the dst vma before the src vma,
+ * moving file-based ptes, we must lock truncate_pagecache
+ * out, since it might clean the dst vma before the src vma,
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+ unsigned long old_len, unsigned long new_len, unsigned long *p)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = find_vma(mm, addr);
+
+ if (!vma || vma->vm_start > addr)
+ goto Efault;
+
+ if (is_vm_hugetlb_page(vma))
+ goto Einval;
+
+ /* We can't remap across vm area boundaries */
+ if (old_len > vma->vm_end - addr)
+ goto Efault;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
+ if (new_len > old_len)
+ goto Efault;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = mm->locked_vm << PAGE_SHIFT;
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ locked += new_len - old_len;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ goto Eagain;
+ }
+
+ if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ goto Enomem;
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ goto Efault;
+ *p = charged;
+ }
+
+ return vma;
+
+Efault: /* very odd choice for most of the cases, but... */
+ return ERR_PTR(-EFAULT);
+Einval:
+ return ERR_PTR(-EINVAL);
+Enomem:
+ return ERR_PTR(-ENOMEM);
+Eagain:
+ return ERR_PTR(-EAGAIN);
+}
+
+static unsigned long mremap_to(unsigned long addr,
+ unsigned long old_len, unsigned long new_addr,
+ unsigned long new_len)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret = -EINVAL;
+ unsigned long charged = 0;
+ unsigned long map_flags;
+
+ if (new_addr & ~PAGE_MASK)
+ goto out;
+
+ if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+ goto out;
+
+ /* Check if the location we're moving into overlaps the
+ * old location at all, and fail if it does.
+ */
+ if ((new_addr <= addr) && (new_addr+new_len) > addr)
+ goto out;
+
+ if ((addr <= new_addr) && (addr+old_len) > new_addr)
+ goto out;
+
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
+
+ ret = do_munmap(mm, new_addr, new_len);
+ if (ret)
+ goto out;
+
+ if (old_len >= new_len) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ if (ret && old_len != new_len)
+ goto out;
+ old_len = new_len;
+ }
+
+ vma = vma_to_resize(addr, old_len, new_len, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
+
+ map_flags = MAP_FIXED;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (ret & ~PAGE_MASK)
+ goto out1;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr);
+ if (!(ret & ~PAGE_MASK))
+ goto out;
+out1:
+ vm_unacct_memory(charged);
+
+out:
+ return ret;
+}
+
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+ unsigned long end = vma->vm_end + delta;
+ if (end < vma->vm_end) /* overflow */
+ return 0;
+ if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ return 0;
+ if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+ 0, MAP_FIXED) & ~PAGE_MASK)
+ return 0;
+ return 1;
+}
+
/*
* Expand (or shrink) an existing mapping, potentially moving it at the
* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr,
if (!new_len)
goto out;
- /* new_addr is only valid if MREMAP_FIXED is specified */
if (flags & MREMAP_FIXED) {
- if (new_addr & ~PAGE_MASK)
- goto out;
- if (!(flags & MREMAP_MAYMOVE))
- goto out;
-
- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- goto out;
-
- /* Check if the location we're moving into overlaps the
- * old location at all, and fail if it does.
- */
- if ((new_addr <= addr) && (new_addr+new_len) > addr)
- goto out;
-
- if ((addr <= new_addr) && (addr+old_len) > new_addr)
- goto out;
-
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
-
- ret = do_munmap(mm, new_addr, new_len);
- if (ret)
- goto out;
+ if (flags & MREMAP_MAYMOVE)
+ ret = mremap_to(addr, old_len, new_addr, new_len);
+ goto out;
}
/*
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr,
if (ret && old_len != new_len)
goto out;
ret = addr;
- if (!(flags & MREMAP_FIXED) || (new_addr == addr))
- goto out;
- old_len = new_len;
+ goto out;
}
/*
- * Ok, we need to grow.. or relocate.
+ * Ok, we need to grow..
*/
- ret = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (is_vm_hugetlb_page(vma)) {
- ret = -EINVAL;
- goto out;
- }
- /* We can't remap across vm area boundaries */
- if (old_len > vma->vm_end - addr)
- goto out;
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
- if (new_len > old_len)
- goto out;
- }
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- locked += new_len - old_len;
- ret = -EAGAIN;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- goto out;
- }
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
- ret = -ENOMEM;
+ vma = vma_to_resize(addr, old_len, new_len, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto out;
}
- if (vma->vm_flags & VM_ACCOUNT) {
- charged = (new_len - old_len) >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- goto out_nc;
- }
-
/* old_len exactly to the end of the area..
- * And we're not relocating the area.
*/
- if (old_len == vma->vm_end - addr &&
- !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
- (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
- unsigned long max_addr = TASK_SIZE;
- if (vma->vm_next)
- max_addr = vma->vm_next->vm_start;
+ if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
- if (max_addr - addr >= new_len) {
+ if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
vma_adjust(vma, vma->vm_start,
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr,
*/
ret = -ENOMEM;
if (flags & MREMAP_MAYMOVE) {
- if (!(flags & MREMAP_FIXED)) {
- unsigned long map_flags = 0;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
-
- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
- vma->vm_pgoff, map_flags);
- if (new_addr & ~PAGE_MASK) {
- ret = new_addr;
- goto out;
- }
-
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
+ unsigned long map_flags = 0;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+ vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (new_addr & ~PAGE_MASK) {
+ ret = new_addr;
+ goto out;
}
+
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
ret = move_vma(vma, addr, old_len, new_len, new_addr);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
-out_nc:
return ret;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 8d484241d034..8687973462bb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar;
struct rb_root nommu_region_tree = RB_ROOT;
DECLARE_RWSEM(nommu_region_sem);
-struct vm_operations_struct generic_file_vm_ops = {
+const struct vm_operations_struct generic_file_vm_ops = {
};
/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- */
-int vmtruncate(struct inode *inode, loff_t offset)
-{
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
-
- if (inode->i_size < offset)
- goto do_expand;
- i_size_write(inode, offset);
-
- truncate_inode_pages(mapping, offset);
- goto out_truncate;
-
-do_expand:
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out;
- i_size_write(inode, offset);
-
-out_truncate:
- if (inode->i_op->truncate)
- inode->i_op->truncate(inode);
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out:
- return -EFBIG;
-}
-
-EXPORT_SYMBOL(vmtruncate);
-
-/*
* Return the total memory allocated for this pointer, not
* just what the caller asked for.
*
@@ -866,7 +826,7 @@ static int validate_mmap_request(struct file *file,
int ret;
/* do the simple checks first */
- if (flags & MAP_FIXED || addr) {
+ if (flags & MAP_FIXED) {
printk(KERN_DEBUG
"%d: Can't do fixed-address/overlay mmap of RAM\n",
current->pid);
@@ -1074,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret == 0) {
vma->vm_region->vm_top = vma->vm_region->vm_end;
- return ret;
+ return 0;
}
if (ret != -ENOSYS)
return ret;
@@ -1091,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
*/
static int do_mmap_private(struct vm_area_struct *vma,
struct vm_region *region,
- unsigned long len)
+ unsigned long len,
+ unsigned long capabilities)
{
struct page *pages;
unsigned long total, point, n, rlen;
@@ -1102,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma,
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
- if (vma->vm_file) {
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret == 0) {
/* shouldn't return success if we're not sharing */
BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end;
- return ret;
+ return 0;
}
if (ret != -ENOSYS)
return ret;
@@ -1182,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma,
if (ret < rlen)
memset(base + ret, 0, rlen - ret);
- } else {
- /* if it's an anonymous mapping, then just clear it */
- memset(base, 0, rlen);
}
return 0;
@@ -1221,9 +1179,6 @@ unsigned long do_mmap_pgoff(struct file *file,
kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
- if (!(flags & MAP_FIXED))
- addr = round_hint_to_min(addr);
-
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1233,6 +1188,9 @@ unsigned long do_mmap_pgoff(struct file *file,
return ret;
}
+ /* we ignore the address hint */
+ addr = 0;
+
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
@@ -1346,7 +1304,7 @@ unsigned long do_mmap_pgoff(struct file *file,
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
- if (file && file->f_op->get_unmapped_area) {
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
if (IS_ERR((void *) addr)) {
@@ -1370,15 +1328,22 @@ unsigned long do_mmap_pgoff(struct file *file,
}
vma->vm_region = region;
- add_nommu_region(region);
- /* set up the mapping */
+ /* set up the mapping
+ * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+ */
if (file && vma->vm_flags & VM_SHARED)
ret = do_mmap_shared_file(vma);
else
- ret = do_mmap_private(vma, region, len);
+ ret = do_mmap_private(vma, region, len, capabilities);
if (ret < 0)
- goto error_put_region;
+ goto error_just_free;
+ add_nommu_region(region);
+
+ /* clear anonymous mappings that don't ask for uninitialized data */
+ if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+ memset((void *)region->vm_start, 0,
+ region->vm_end - region->vm_start);
/* okay... we have a mapping; now we have to register it */
result = vma->vm_start;
@@ -1396,25 +1361,14 @@ share:
kleave(" = %lx", result);
return result;
-error_put_region:
- __put_nommu_region(region);
- if (vma) {
- if (vma->vm_file) {
- fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
- }
- kmem_cache_free(vm_area_cachep, vma);
- }
- kleave(" = %d [pr]", ret);
- return ret;
-
error_just_free:
up_write(&nommu_region_sem);
error:
- fput(region->vm_file);
+ if (region->vm_file)
+ fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
- fput(vma->vm_file);
+ if (vma->vm_file)
+ fput(vma->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(vma->vm_mm);
kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..f52481b1c1e5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
/*
* Determine the type of allocation constraint.
*/
-static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask)
-{
#ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+ gfp_t gfp_mask, nodemask_t *nodemask)
+{
struct zone *zone;
struct zoneref *z;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- nodemask_t nodes = node_states[N_HIGH_MEMORY];
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- if (cpuset_zone_allowed_softwall(zone, gfp_mask))
- node_clear(zone_to_nid(zone), nodes);
- else
- return CONSTRAINT_CPUSET;
+ /*
+ * Reach here only when __GFP_NOFAIL is used. So, we should avoid
+ * to kill current.We have to random task kill in this case.
+ * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
+ */
+ if (gfp_mask & __GFP_THISNODE)
+ return CONSTRAINT_NONE;
- if (!nodes_empty(nodes))
+ /*
+ * The nodemask here is a nodemask passed to alloc_pages(). Now,
+ * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
+ * feature. mempolicy is an only user of nodemask here.
+ * check mempolicy's nodemask contains all N_HIGH_MEMORY
+ */
+ if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
return CONSTRAINT_MEMORY_POLICY;
-#endif
+ /* Check this allocation failure is caused by cpuset's wall function */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask)
+ if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+ return CONSTRAINT_CPUSET;
+
+ return CONSTRAINT_NONE;
+}
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+ gfp_t gfp_mask, nodemask_t *nodemask)
+{
return CONSTRAINT_NONE;
}
+#endif
/*
* Simple selection loop. We chose the process with the highest
@@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem)
} while_each_thread(g, p);
}
+static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
+ struct mem_cgroup *mem)
+{
+ pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+ "oom_adj=%d\n",
+ current->comm, gfp_mask, order, current->signal->oom_adj);
+ task_lock(current);
+ cpuset_print_task_mems_allowed(current);
+ task_unlock(current);
+ dump_stack();
+ mem_cgroup_print_oom_info(mem, p);
+ show_mem();
+ if (sysctl_oom_dump_tasks)
+ dump_tasks(mem);
+}
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
/*
* Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
return;
}
+ task_lock(p);
if (!p->mm) {
WARN_ON(1);
- printk(KERN_WARNING "tried to kill an mm-less task!\n");
+ printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
return;
}
if (verbose)
- printk(KERN_ERR "Killed process %d (%s)\n",
- task_pid_nr(p), p->comm);
+ printk(KERN_ERR "Killed process %d (%s) "
+ "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(p), p->comm,
+ K(p->mm->total_vm),
+ K(get_mm_counter(p->mm, anon_rss)),
+ K(get_mm_counter(p->mm, file_rss)));
+ task_unlock(p);
/*
* We give our sacrificial lamb high priority and access to
@@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *c;
- if (printk_ratelimit()) {
- printk(KERN_WARNING "%s invoked oom-killer: "
- "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
- current->comm, gfp_mask, order,
- current->signal->oom_adj);
- task_lock(current);
- cpuset_print_task_mems_allowed(current);
- task_unlock(current);
- dump_stack();
- mem_cgroup_print_oom_info(mem, current);
- show_mem();
- if (sysctl_oom_dump_tasks)
- dump_tasks(mem);
- }
+ if (printk_ratelimit())
+ dump_header(p, gfp_mask, order, mem);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -544,6 +577,7 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
+ dump_header(NULL, gfp_mask, order, NULL);
panic("Out of memory and no killable processes...\n");
}
@@ -599,7 +633,8 @@ rest_and_return:
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask)
{
unsigned long freed = 0;
enum oom_constraint constraint;
@@ -609,14 +644,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
/* Got some memory back in the last second. */
return;
- if (sysctl_panic_on_oom == 2)
+ if (sysctl_panic_on_oom == 2) {
+ dump_header(NULL, gfp_mask, order, NULL);
panic("out of memory. Compulsory panic_on_oom is selected.\n");
+ }
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask);
+ constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
read_lock(&tasklist_lock);
switch (constraint) {
@@ -626,8 +663,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
break;
case CONSTRAINT_NONE:
- if (sysctl_panic_on_oom)
+ if (sysctl_panic_on_oom) {
+ dump_header(NULL, gfp_mask, order, NULL);
panic("out of memory. panic_on_oom is selected\n");
+ }
/* Fall-through */
case CONSTRAINT_CPUSET:
__out_of_memory(gfp_mask, order);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f378dd58802..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -44,18 +44,21 @@ static long ratelimit_pages = 32;
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * It should be somewhat larger than dirtied pages to ensure that reasonably
* large amounts of I/O are submitted.
*/
-static inline long sync_writeback_pages(void)
+static inline long sync_writeback_pages(unsigned long dirtied)
{
- return ratelimit_pages + ratelimit_pages / 2;
+ if (dirtied < ratelimit_pages)
+ dirtied = ratelimit_pages;
+
+ return dirtied + dirtied / 2;
}
/* The following parameters are exported via /proc/sys/vm */
/*
- * Start background writeback (via pdflush) at this percentage
+ * Start background writeback (via writeback threads) at this percentage
*/
int dirty_background_ratio = 10;
@@ -155,37 +158,37 @@ static void update_completion_period(void)
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
- ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_bytes = 0;
return ret;
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
- ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_ratio = 0;
return ret;
}
int dirty_ratio_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
- ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
update_completion_period();
vm_dirty_bytes = 0;
@@ -195,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
int dirty_bytes_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
- ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
update_completion_period();
vm_dirty_ratio = 0;
@@ -474,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
* the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
*/
-static void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping,
+ unsigned long write_chunk)
{
long nr_reclaimable, bdi_nr_reclaimable;
long nr_writeback, bdi_nr_writeback;
@@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping)
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long pages_written = 0;
- unsigned long write_chunk = sync_writeback_pages();
unsigned long pause = 1;
struct backing_dev_info *bdi = mapping->backing_dev_info;
@@ -563,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping)
if (pages_written >= write_chunk)
break; /* We've done our duty */
- schedule_timeout_interruptible(pause);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule_timeout(pause);
/*
* Increase the delay for each loop, up to our previous
@@ -579,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
- return; /* pdflush is already working this queue */
+ return;
/*
* In laptop mode, we wait until hitting the higher threshold before
@@ -590,10 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping)
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
+ (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS))
> background_thresh)))
- bdi_start_writeback(bdi, nr_writeback);
+ bdi_start_writeback(bdi, NULL, 0);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -640,9 +644,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
+ ratelimit = sync_writeback_pages(*p);
*p = 0;
preempt_enable();
- balance_dirty_pages(mapping);
+ balance_dirty_pages(mapping, ratelimit);
return;
}
preempt_enable();
@@ -686,9 +691,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
return 0;
}
@@ -816,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
struct pagevec pvec;
@@ -829,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
int range_whole = 0;
long nr_to_write = wbc->nr_to_write;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- return 0;
- }
-
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
@@ -952,12 +951,6 @@ continue_unlock:
break;
}
}
-
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- done = 1;
- break;
- }
}
pagevec_release(&pvec);
cond_resched();
@@ -1149,6 +1142,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
EXPORT_SYMBOL(redirty_page_for_writepage);
/*
+ * Dirty a page.
+ *
+ * For pages with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors who prefer a consistent
+ * dirty state. This rule can be broken in some special cases,
+ * but should be better not to.
+ *
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5717f27a0704..850c4a7e2fe5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
static unsigned long nr_shown;
static unsigned long nr_unshown;
+ /* Don't complain about poisoned pages */
+ if (PageHWPoison(page)) {
+ __ClearPageBuddy(page);
+ return;
+ }
+
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
@@ -480,7 +486,6 @@ static inline void __free_one_page(struct page *page,
zone->free_area[order].nr_free++;
}
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
/*
* free_page_mlock() -- clean up attempts to free and mlocked() page.
* Page should not be on lru, so no need to fix that up.
@@ -491,9 +496,6 @@ static inline void free_page_mlock(struct page *page)
__dec_zone_page_state(page, NR_MLOCK);
__count_vm_event(UNEVICTABLE_MLOCKFREED);
}
-#else
-static void free_page_mlock(struct page *page) { }
-#endif
static inline int free_pages_check(struct page *page)
{
@@ -666,7 +668,7 @@ static inline void expand(struct zone *zone, struct page *page,
/*
* This page is about to be returned from the page allocator
*/
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static inline int check_new_page(struct page *page)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
@@ -675,6 +677,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
bad_page(page);
return 1;
}
+ return 0;
+}
+
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+ int i;
+
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+ if (unlikely(check_new_page(p)))
+ return 1;
+ }
set_page_private(page, 0);
set_page_refcounted(page);
@@ -1640,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (page)
goto out;
- /* The OOM killer will not help higher order allocs */
- if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
- goto out;
-
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+ /*
+ * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+ * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+ * The caller should handle page allocation failure by itself if
+ * it specifies __GFP_THISNODE.
+ * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+ */
+ if (gfp_mask & __GFP_THISNODE)
+ goto out;
+ }
/* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order);
+ out_of_memory(zonelist, gfp_mask, order, nodemask);
out:
clear_zonelist_oom(zonelist, gfp_mask);
@@ -1751,7 +1775,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)))
+ } else if (unlikely(rt_task(p)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1799,9 +1823,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
+restart:
wake_all_kswapd(order, zonelist, high_zoneidx);
-restart:
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
@@ -2165,7 +2189,7 @@ void show_free_areas(void)
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
+ " dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
global_page_state(NR_ACTIVE_ANON),
@@ -2178,7 +2202,6 @@ void show_free_areas(void)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- nr_blockdev_pages(),
global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
@@ -2373,7 +2396,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length,
+ void __user *buffer, size_t *length,
loff_t *ppos)
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2382,7 +2405,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
if (write)
strncpy(saved_string, (char*)table->data,
NUMA_ZONELIST_ORDER_LEN);
- ret = proc_dostring(table, write, file, buffer, length, ppos);
+ ret = proc_dostring(table, write, buffer, length, ppos);
if (ret)
return ret;
if (write) {
@@ -3110,7 +3133,7 @@ static int __cpuinit process_zones(int cpu)
if (percpu_pagelist_fraction)
setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
+ (zone->present_pages / percpu_pagelist_fraction));
}
return 0;
@@ -4706,9 +4729,9 @@ module_init(init_per_zone_wmark_min)
* changes.
*/
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
if (write)
setup_per_zone_wmarks();
return 0;
@@ -4716,12 +4739,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
- rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
@@ -4732,12 +4755,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
}
int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
- rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
@@ -4758,9 +4781,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
setup_per_zone_lowmem_reserve();
return 0;
}
@@ -4772,13 +4795,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
*/
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
unsigned int cpu;
int ret;
- ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || (ret == -EINVAL))
return ret;
for_each_populated_zone(zone) {
diff --git a/mm/page_io.c b/mm/page_io.c
index c6f3e5071de3..a19af956ee1b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,20 +19,15 @@
#include <linux/writeback.h>
#include <asm/pgtable.h>
-static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
struct bio *bio;
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- struct swap_info_struct *sis;
- swp_entry_t entry = { .val = index, };
-
- sis = get_swap_info_struct(swp_type(entry));
- bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
- (PAGE_SIZE >> 9);
- bio->bi_bdev = sis->bdev;
+ bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
@@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
- bio = get_swap_bio(GFP_NOIO, page_private(page), page,
- end_swap_bio_write);
+ bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -127,8 +121,7 @@ int swap_readpage(struct page *page)
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
- bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
- end_swap_bio_read);
+ bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d5878bed7841..7b47a57b6646 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,6 +1,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/hugetlb.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
int err = 0;
+ struct vm_area_struct *vma;
if (addr >= end)
return err;
@@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
+
+ /*
+ * handle hugetlb vma individually because pagetable walk for
+ * the hugetlb page is dependent on the architecture and
+ * we can't handled it in the same manner as non-huge pages.
+ */
+ vma = find_vma(walk->mm, addr);
+#ifdef CONFIG_HUGETLB_PAGE
+ if (vma && is_vm_hugetlb_page(vma)) {
+ pte_t *pte;
+ struct hstate *hs;
+
+ if (vma->vm_end < next)
+ next = vma->vm_end;
+ hs = hstate_vma(vma);
+ pte = huge_pte_offset(walk->mm,
+ addr & huge_page_mask(hs));
+ if (pte && !huge_pte_none(huge_ptep_get(pte))
+ && walk->hugetlb_entry)
+ err = walk->hugetlb_entry(pte, addr,
+ next, walk);
+ if (err)
+ break;
+ continue;
+ }
+#endif
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
+ pgd++;
continue;
}
if (walk->pgd_entry)
@@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end,
err = walk_pud_range(pgd, addr, next, walk);
if (err)
break;
- } while (pgd++, addr = next, addr != end);
+ pgd++;
+ } while (addr = next, addr != end);
return err;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 43d8cacfdaa5..442010cc91c6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -46,8 +46,6 @@
*
* To use this allocator, arch code should do the followings.
*
- * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
- *
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
* different from the default
@@ -74,6 +72,7 @@
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
@@ -153,7 +152,10 @@ static int pcpu_reserved_chunk_limit;
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released. In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
*
* Free path accesses and alters only the index data structures, so it
* can be safely called from atomic context. When memory needs to be
@@ -352,62 +354,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
}
/**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
*
- * Extend area map of @chunk so that it can accomodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accomodate a new allocation.
*
* CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
*
* RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
*/
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
{
int new_alloc;
- int *new;
- size_t size;
- /* has enough? */
if (chunk->map_alloc >= chunk->map_used + 2)
return 0;
- spin_unlock_irq(&pcpu_lock);
-
new_alloc = PCPU_DFL_MAP_ALLOC;
while (new_alloc < chunk->map_used + 2)
new_alloc *= 2;
- new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
- if (!new) {
- spin_lock_irq(&pcpu_lock);
+ return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+ int *old = NULL, *new = NULL;
+ size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+ unsigned long flags;
+
+ new = pcpu_mem_alloc(new_size);
+ if (!new)
return -ENOMEM;
- }
- /*
- * Acquire pcpu_lock and switch to new area map. Only free
- * could have happened inbetween, so map_used couldn't have
- * grown.
- */
- spin_lock_irq(&pcpu_lock);
- BUG_ON(new_alloc < chunk->map_used + 2);
+ /* acquire pcpu_lock and switch to new area map */
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ if (new_alloc <= chunk->map_alloc)
+ goto out_unlock;
- size = chunk->map_alloc * sizeof(chunk->map[0]);
- memcpy(new, chunk->map, size);
+ old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+ memcpy(new, chunk->map, old_size);
/*
* map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
* one of the first chunks and still using static map.
*/
if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
- pcpu_mem_free(chunk->map, size);
+ old = chunk->map;
chunk->map_alloc = new_alloc;
chunk->map = new;
+ new = NULL;
+
+out_unlock:
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ /*
+ * pcpu_mem_free() might end up calling vfree() which uses
+ * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+ */
+ pcpu_mem_free(old, old_size);
+ pcpu_mem_free(new, new_size);
+
return 0;
}
@@ -1043,8 +1069,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
*/
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
+ static int warn_limit = 10;
struct pcpu_chunk *chunk;
- int slot, off;
+ const char *err;
+ int slot, off, new_alloc;
+ unsigned long flags;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1053,17 +1082,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
}
mutex_lock(&pcpu_alloc_mutex);
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
- if (size > chunk->contig_hint ||
- pcpu_extend_area_map(chunk) < 0)
+
+ if (size > chunk->contig_hint) {
+ err = "alloc from reserved chunk failed";
goto fail_unlock;
+ }
+
+ while ((new_alloc = pcpu_need_to_extend(chunk))) {
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+ err = "failed to extend area map of reserved chunk";
+ goto fail_unlock_mutex;
+ }
+ spin_lock_irqsave(&pcpu_lock, flags);
+ }
+
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
+
+ err = "alloc from reserved chunk failed";
goto fail_unlock;
}
@@ -1074,13 +1117,20 @@ restart:
if (size > chunk->contig_hint)
continue;
- switch (pcpu_extend_area_map(chunk)) {
- case 0:
- break;
- case 1:
- goto restart; /* pcpu_lock dropped, restart */
- default:
- goto fail_unlock;
+ new_alloc = pcpu_need_to_extend(chunk);
+ if (new_alloc) {
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ if (pcpu_extend_area_map(chunk,
+ new_alloc) < 0) {
+ err = "failed to extend area map";
+ goto fail_unlock_mutex;
+ }
+ spin_lock_irqsave(&pcpu_lock, flags);
+ /*
+ * pcpu_lock has been dropped, need to
+ * restart cpu_slot list walking.
+ */
+ goto restart;
}
off = pcpu_alloc_area(chunk, size, align);
@@ -1090,23 +1140,26 @@ restart:
}
/* hmmm... no space left, create a new chunk */
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
chunk = alloc_pcpu_chunk();
- if (!chunk)
+ if (!chunk) {
+ err = "failed to allocate new chunk";
goto fail_unlock_mutex;
+ }
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_relocate(chunk, -1);
goto restart;
area_found:
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_free_area(chunk, off);
+ err = "failed to populate";
goto fail_unlock;
}
@@ -1116,9 +1169,16 @@ area_found:
return __addr_to_pcpu_ptr(chunk->base_addr + off);
fail_unlock:
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
+ if (warn_limit) {
+ pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
+ "%s\n", size, align, err);
+ dump_stack();
+ if (!--warn_limit)
+ pr_info("PERCPU: limit reached, disable warning\n");
+ }
return NULL;
}
@@ -1241,6 +1301,27 @@ void free_percpu(void *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * Given @addr which is dereferenceable address obtained via one of
+ * percpu access macros, this function translates it into its physical
+ * address. The caller is responsible for ensuring @addr stays valid
+ * until this function finishes.
+ *
+ * RETURNS:
+ * The physical address for @addr.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+ if ((unsigned long)addr < VMALLOC_START ||
+ (unsigned long)addr >= VMALLOC_END)
+ return __pa(addr);
+ else
+ return page_to_phys(vmalloc_to_page(addr));
+}
+
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)
@@ -1347,6 +1428,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
struct pcpu_alloc_info *ai;
unsigned int *cpu_map;
+ /* this function may be called multiple times */
+ memset(group_map, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_map));
+
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
@@ -1574,6 +1659,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
+ static char cpus_buf[4096] __initdata;
static int smap[2], dmap[2];
size_t dyn_size = ai->dyn_size;
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
@@ -1585,17 +1671,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int *unit_map;
int group, unit, i;
+ cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
+
+#define PCPU_SETUP_BUG_ON(cond) do { \
+ if (unlikely(cond)) { \
+ pr_emerg("PERCPU: failed to initialize, %s", #cond); \
+ pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
+ pcpu_dump_alloc_info(KERN_EMERG, ai); \
+ BUG(); \
+ } \
+} while (0)
+
/* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
- BUG_ON(ai->nr_groups <= 0);
- BUG_ON(!ai->static_size);
- BUG_ON(!base_addr);
- BUG_ON(ai->unit_size < size_sum);
- BUG_ON(ai->unit_size & ~PAGE_MASK);
- BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
-
- pcpu_dump_alloc_info(KERN_DEBUG, ai);
+ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+ PCPU_SETUP_BUG_ON(!ai->static_size);
+ PCPU_SETUP_BUG_ON(!base_addr);
+ PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
+ PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
/* process group information and build config tables accordingly */
group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1604,7 +1699,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
- unit_map[cpu] = NR_CPUS;
+ unit_map[cpu] = UINT_MAX;
pcpu_first_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
@@ -1618,8 +1713,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (cpu == NR_CPUS)
continue;
- BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
- BUG_ON(unit_map[cpu] != NR_CPUS);
+ PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
+ PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
+ PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
@@ -1632,7 +1728,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
- BUG_ON(unit_map[cpu] == NR_CPUS);
+ PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
+
+ /* we're done parsing the input, undefine BUG macro and dump config */
+#undef PCPU_SETUP_BUG_ON
+ pcpu_dump_alloc_info(KERN_INFO, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
@@ -1782,7 +1882,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
- size_t size_sum, areas_size;
+ size_t size_sum, areas_size, max_distance;
int group, i, rc;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
@@ -1832,8 +1932,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
}
/* base address is now known, determine group base offsets */
- for (group = 0; group < ai->nr_groups; group++)
+ max_distance = 0;
+ for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
+ max_distance = max_t(size_t, max_distance,
+ ai->groups[group].base_offset);
+ }
+ max_distance += ai->unit_size;
+
+ /* warn if maximum distance is further than 75% of vmalloc space */
+ if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
+ pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
+ "space 0x%lx\n",
+ max_distance, VMALLOC_END - VMALLOC_START);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ /* and fail if we have fallback */
+ rc = -EINVAL;
+ goto out_free;
+#endif
+ }
pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6eedf7e473d1..6633965bb27b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
int node = numa_node_id();
struct zone *zones = NODE_DATA(node)->node_zones;
int num_cpus_on_node;
- const struct cpumask *cpumask_on_node = cpumask_of_node(node);
node_free_pages =
#ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
max = node_free_pages / FRACTION_OF_NODE_MEM;
- num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+ num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
max /= num_cpus_on_node;
return max(max, min_pages);
diff --git a/mm/rmap.c b/mm/rmap.c
index 720fc03a7bc4..278cd277bdec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within inode_lock in __sync_single_inode)
+ *
+ * (code doesn't rely on that order so it could be switched around)
+ * ->tasklist_lock
+ * anon_vma->lock (memory_failure, collect_procs_anon)
+ * pte map lock
*/
#include <linux/mm.h>
@@ -44,6 +49,7 @@
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
@@ -62,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}
-static inline void anon_vma_free(struct anon_vma *anon_vma)
+void anon_vma_free(struct anon_vma *anon_vma)
{
kmem_cache_free(anon_vma_cachep, anon_vma);
}
@@ -166,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
list_del(&vma->anon_vma_node);
/* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head);
+ empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
spin_unlock(&anon_vma->lock);
if (empty)
@@ -178,6 +184,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
spin_lock_init(&anon_vma->lock);
+ ksm_refcount_init(anon_vma);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -191,14 +198,14 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) page->mapping;
- if (!(anon_mapping & PAGE_MAPPING_ANON))
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
@@ -211,7 +218,7 @@ out:
return NULL;
}
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
@@ -237,14 +244,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
}
/*
- * At what user virtual address is page expected in vma? checking that the
- * page matches the vma: currently only used on anon pages, by unuse_vma;
+ * At what user virtual address is page expected in vma?
+ * checking that the page matches the vma.
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
- if ((void *)vma->anon_vma !=
- (void *)page->mapping - PAGE_MAPPING_ANON)
+ if (vma->anon_vma != page_anon_vma(page))
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@@ -311,7 +317,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
* if the page is not mapped into the page tables of this VMA. Only
* valid for normal file or anonymous VMAs.
*/
-static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
unsigned long address;
pte_t *pte;
@@ -332,21 +338,15 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
* Subfunctions of page_referenced: page_referenced_one called
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
-static int page_referenced_one(struct page *page,
- struct vm_area_struct *vma,
- unsigned int *mapcount,
- unsigned long *vm_flags)
+int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *mapcount,
+ unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
- address = vma_address(page, vma);
- if (address == -EFAULT)
- goto out;
-
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -383,9 +383,10 @@ static int page_referenced_one(struct page *page,
out_unmap:
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
-out:
+
if (referenced)
*vm_flags |= vma->vm_flags;
+out:
return referenced;
}
@@ -404,6 +405,9 @@ static int page_referenced_anon(struct page *page,
mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -411,7 +415,7 @@ static int page_referenced_anon(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
- referenced += page_referenced_one(page, vma,
+ referenced += page_referenced_one(page, vma, address,
&mapcount, vm_flags);
if (!mapcount)
break;
@@ -469,6 +473,9 @@ static int page_referenced_file(struct page *page,
mapcount = page_mapcount(page);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -476,7 +483,7 @@ static int page_referenced_file(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
- referenced += page_referenced_one(page, vma,
+ referenced += page_referenced_one(page, vma, address,
&mapcount, vm_flags);
if (!mapcount)
break;
@@ -502,46 +509,47 @@ int page_referenced(struct page *page,
unsigned long *vm_flags)
{
int referenced = 0;
+ int we_locked = 0;
if (TestClearPageReferenced(page))
referenced++;
*vm_flags = 0;
- if (page_mapped(page) && page->mapping) {
- if (PageAnon(page))
+ if (page_mapped(page) && page_rmapping(page)) {
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked) {
+ referenced++;
+ goto out;
+ }
+ }
+ if (unlikely(PageKsm(page)))
+ referenced += page_referenced_ksm(page, mem_cont,
+ vm_flags);
+ else if (PageAnon(page))
referenced += page_referenced_anon(page, mem_cont,
vm_flags);
- else if (is_locked)
+ else if (page->mapping)
referenced += page_referenced_file(page, mem_cont,
vm_flags);
- else if (!trylock_page(page))
- referenced++;
- else {
- if (page->mapping)
- referenced += page_referenced_file(page,
- mem_cont, vm_flags);
+ if (we_locked)
unlock_page(page);
- }
}
-
+out:
if (page_test_and_clear_young(page))
referenced++;
return referenced;
}
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
- address = vma_address(page, vma);
- if (address == -EFAULT)
- goto out;
-
pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
goto out;
@@ -573,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
- if (vma->vm_flags & VM_SHARED)
- ret += page_mkclean_one(page, vma);
+ if (vma->vm_flags & VM_SHARED) {
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
+ ret += page_mkclean_one(page, vma, address);
+ }
}
spin_unlock(&mapping->i_mmap_lock);
return ret;
@@ -615,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
-
page->index = linear_page_index(vma, address);
-
- /*
- * nr_mapped state can be updated without turning off
- * interrupts because it is not modified via interrupt.
- */
- __inc_zone_page_state(page, NR_ANON_PAGES);
}
/**
@@ -660,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page,
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the pte lock and the page must be locked.
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting,
+ * and to ensure that PageAnon is not being upgraded racily to PageKsm
+ * (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ int first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (unlikely(PageKsm(page)))
+ return;
+
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- if (atomic_inc_and_test(&page->_mapcount))
+ if (first)
__page_set_anon_rmap(page, vma, address);
else
__page_check_anon_rmap(page, vma, address);
@@ -689,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+ __inc_zone_page_state(page, NR_ANON_PAGES);
__page_set_anon_rmap(page, vma, address);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -706,7 +721,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_mapped_file_stat(page, 1);
+ mem_cgroup_update_file_mapped(page, 1);
}
}
@@ -738,8 +753,8 @@ void page_remove_rmap(struct page *page)
__dec_zone_page_state(page, NR_ANON_PAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
+ mem_cgroup_update_file_mapped(page, -1);
}
- mem_cgroup_update_mapped_file_stat(page, -1);
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -755,20 +770,15 @@ void page_remove_rmap(struct page *page)
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- int migration)
+int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, enum ttu_flags flags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
- address = vma_address(page, vma);
- if (address == -EFAULT)
- goto out;
-
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -778,11 +788,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
*/
- if (!migration) {
- if (vma->vm_flags & VM_LOCKED) {
- ret = SWAP_MLOCK;
+ if (!(flags & TTU_IGNORE_MLOCK)) {
+ if (vma->vm_flags & VM_LOCKED)
+ goto out_mlock;
+
+ if (TTU_ACTION(flags) == TTU_MUNLOCK)
goto out_unmap;
- }
+ }
+ if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address, pte)) {
ret = SWAP_FAIL;
goto out_unmap;
@@ -800,7 +813,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageAnon(page)) {
+ if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageAnon(page))
+ dec_mm_counter(mm, anon_rss);
+ else
+ dec_mm_counter(mm, file_rss);
+ set_pte_at(mm, address, pte,
+ swp_entry_to_pte(make_hwpoison_entry(page)));
+ } else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
if (PageSwapCache(page)) {
@@ -808,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- swap_duplicate(entry);
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
@@ -822,12 +846,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- BUG_ON(!migration);
+ BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
entry = make_migration_entry(page, pte_write(pteval));
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- } else if (PAGE_MIGRATION && migration) {
+ } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
@@ -835,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, file_rss);
-
page_remove_rmap(page);
page_cache_release(page);
@@ -843,6 +866,27 @@ out_unmap:
pte_unmap_unlock(pte, ptl);
out:
return ret;
+
+out_mlock:
+ pte_unmap_unlock(pte, ptl);
+
+
+ /*
+ * We need mmap_sem locking, Otherwise VM_LOCKED check makes
+ * unstable result and race. Plus, We can't wait here because
+ * we now hold anon_vma->lock or mapping->i_mmap_lock.
+ * if trylock failed, the page remain in evictable lru and later
+ * vmscan could retry to move the page to unevictable lru if the
+ * page is actually mlocked.
+ */
+ if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+ if (vma->vm_flags & VM_LOCKED) {
+ mlock_vma_page(page);
+ ret = SWAP_MLOCK;
+ }
+ up_read(&vma->vm_mm->mmap_sem);
+ }
+ return ret;
}
/*
@@ -908,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
/*
- * MLOCK_PAGES => feature is configured.
- * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
+ * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
* keep the sem while scanning the cluster for mlocking pages.
*/
- if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
+ if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
locked_vma = (vma->vm_flags & VM_LOCKED);
if (!locked_vma)
up_read(&vma->vm_mm->mmap_sem); /* don't need it */
@@ -962,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-/*
- * common handling for pages mapped in VM_LOCKED vmas
- */
-static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
-{
- int mlocked = 0;
-
- if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- if (vma->vm_flags & VM_LOCKED) {
- mlock_vma_page(page);
- mlocked++; /* really mlocked the page */
- }
- up_read(&vma->vm_mm->mmap_sem);
- }
- return mlocked;
-}
-
/**
* try_to_unmap_anon - unmap or unlock anonymous page using the object-based
* rmap method
* @page: the page to unmap/unlock
- * @unlock: request for unlock rather than unmap [unlikely]
- * @migration: unmapping for migration - ignored if @unlock
+ * @flags: action and flags
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
@@ -996,53 +1021,33 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* 'LOCKED.
*/
-static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
- unsigned int mlocked = 0;
int ret = SWAP_AGAIN;
- if (MLOCK_PAGES && unlikely(unlock))
- ret = SWAP_SUCCESS; /* default for try_to_munlock() */
-
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- if (MLOCK_PAGES && unlikely(unlock)) {
- if (!((vma->vm_flags & VM_LOCKED) &&
- page_mapped_in_vma(page, vma)))
- continue; /* must visit all unlocked vmas */
- ret = SWAP_MLOCK; /* saw at least one mlocked vma */
- } else {
- ret = try_to_unmap_one(page, vma, migration);
- if (ret == SWAP_FAIL || !page_mapped(page))
- break;
- }
- if (ret == SWAP_MLOCK) {
- mlocked = try_to_mlock_page(page, vma);
- if (mlocked)
- break; /* stop if actually mlocked page */
- }
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
+ ret = try_to_unmap_one(page, vma, address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page))
+ break;
}
page_unlock_anon_vma(anon_vma);
-
- if (mlocked)
- ret = SWAP_MLOCK; /* actually mlocked the page */
- else if (ret == SWAP_MLOCK)
- ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
-
return ret;
}
/**
* try_to_unmap_file - unmap/unlock file page using the object-based rmap method
* @page: the page to unmap/unlock
- * @unlock: request for unlock rather than unmap [unlikely]
- * @migration: unmapping for migration - ignored if @unlock
+ * @flags: action and flags
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
@@ -1054,7 +1059,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* 'LOCKED.
*/
-static int try_to_unmap_file(struct page *page, int unlock, int migration)
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1065,46 +1070,30 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
unsigned int mapcount;
- unsigned int mlocked = 0;
-
- if (MLOCK_PAGES && unlikely(unlock))
- ret = SWAP_SUCCESS; /* default for try_to_munlock() */
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
- if (MLOCK_PAGES && unlikely(unlock)) {
- if (!((vma->vm_flags & VM_LOCKED) &&
- page_mapped_in_vma(page, vma)))
- continue; /* must visit all vmas */
- ret = SWAP_MLOCK;
- } else {
- ret = try_to_unmap_one(page, vma, migration);
- if (ret == SWAP_FAIL || !page_mapped(page))
- goto out;
- }
- if (ret == SWAP_MLOCK) {
- mlocked = try_to_mlock_page(page, vma);
- if (mlocked)
- break; /* stop if actually mlocked page */
- }
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
+ ret = try_to_unmap_one(page, vma, address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page))
+ goto out;
}
- if (mlocked)
+ if (list_empty(&mapping->i_mmap_nonlinear))
goto out;
- if (list_empty(&mapping->i_mmap_nonlinear))
+ /*
+ * We don't bother to try to find the munlocked page in nonlinears.
+ * It's costly. Instead, later, page reclaim logic may call
+ * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
+ */
+ if (TTU_ACTION(flags) == TTU_MUNLOCK)
goto out;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
- if (MLOCK_PAGES && unlikely(unlock)) {
- if (!(vma->vm_flags & VM_LOCKED))
- continue; /* must visit all vmas */
- ret = SWAP_MLOCK; /* leave mlocked == 0 */
- goto out; /* no need to look further */
- }
- if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
- continue;
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1137,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
- if (!MLOCK_PAGES && !migration &&
- (vma->vm_flags & VM_LOCKED))
- continue;
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
- ret = try_to_unmap_cluster(cursor, &mapcount,
- vma, page);
- if (ret == SWAP_MLOCK)
- mlocked = 2; /* to return below */
+ if (try_to_unmap_cluster(cursor, &mapcount,
+ vma, page) == SWAP_MLOCK)
+ ret = SWAP_MLOCK;
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
@@ -1167,17 +1152,13 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
vma->vm_private_data = NULL;
out:
spin_unlock(&mapping->i_mmap_lock);
- if (mlocked)
- ret = SWAP_MLOCK; /* actually mlocked the page */
- else if (ret == SWAP_MLOCK)
- ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
return ret;
}
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
- * @migration: migration flag
+ * @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
@@ -1188,16 +1169,18 @@ out:
* SWAP_FAIL - the page is unswappable
* SWAP_MLOCK - page is mlocked.
*/
-int try_to_unmap(struct page *page, int migration)
+int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
BUG_ON(!PageLocked(page));
- if (PageAnon(page))
- ret = try_to_unmap_anon(page, 0, migration);
+ if (unlikely(PageKsm(page)))
+ ret = try_to_unmap_ksm(page, flags);
+ else if (PageAnon(page))
+ ret = try_to_unmap_anon(page, flags);
else
- ret = try_to_unmap_file(page, 0, migration);
+ ret = try_to_unmap_file(page, flags);
if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
@@ -1213,17 +1196,98 @@ int try_to_unmap(struct page *page, int migration)
*
* Return values are:
*
- * SWAP_SUCCESS - no vma's holding page mlocked.
+ * SWAP_AGAIN - no vma is holding page mlocked, or,
* SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL - page cannot be located at present
* SWAP_MLOCK - page is now mlocked.
*/
int try_to_munlock(struct page *page)
{
VM_BUG_ON(!PageLocked(page) || PageLRU(page));
- if (PageAnon(page))
- return try_to_unmap_anon(page, 1, 0);
+ if (unlikely(PageKsm(page)))
+ return try_to_unmap_ksm(page, TTU_MUNLOCK);
+ else if (PageAnon(page))
+ return try_to_unmap_anon(page, TTU_MUNLOCK);
else
- return try_to_unmap_file(page, 1, 0);
+ return try_to_unmap_file(page, TTU_MUNLOCK);
}
+#ifdef CONFIG_MIGRATION
+/*
+ * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+ int ret = SWAP_AGAIN;
+
+ /*
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+ * because that depends on page_mapped(); but not all its usages
+ * are holding mmap_sem, which also gave the necessary guarantee
+ * (that this anon_vma's slab has not already been destroyed).
+ * This needs to be reviewed later: avoiding page_lock_anon_vma()
+ * is risky, and currently limits the usefulness of rmap_walk().
+ */
+ anon_vma = page_anon_vma(page);
+ if (!anon_vma)
+ return ret;
+ spin_lock(&anon_vma->lock);
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
+ ret = rmap_one(page, vma, address, arg);
+ if (ret != SWAP_AGAIN)
+ break;
+ }
+ spin_unlock(&anon_vma->lock);
+ return ret;
+}
+
+static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+ int ret = SWAP_AGAIN;
+
+ if (!mapping)
+ return ret;
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ unsigned long address = vma_address(page, vma);
+ if (address == -EFAULT)
+ continue;
+ ret = rmap_one(page, vma, address, arg);
+ if (ret != SWAP_AGAIN)
+ break;
+ }
+ /*
+ * No nonlinear handling: being always shared, nonlinear vmas
+ * never contain migration ptes. Decide what to do about this
+ * limitation to linear when we need rmap_walk() on nonlinear.
+ */
+ spin_unlock(&mapping->i_mmap_lock);
+ return ret;
+}
+
+int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+ VM_BUG_ON(!PageLocked(page));
+
+ if (unlikely(PageKsm(page)))
+ return rmap_walk_ksm(page, rmap_one, arg);
+ else if (PageAnon(page))
+ return rmap_walk_anon(page, rmap_one, arg);
+ else
+ return rmap_walk_file(page, rmap_one, arg);
+}
+#endif /* CONFIG_MIGRATION */
diff --git a/mm/shmem.c b/mm/shmem.c
index b206a7a32e2a..4fb41c83daca 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -218,7 +218,7 @@ static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
-static struct vm_operations_struct shmem_vm_ops;
+static const struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
goto out;
}
mutex_unlock(&shmem_swaplist_mutex);
-out: return found; /* 0 or 1 or -ENOMEM */
+ /*
+ * Can some race bring us here? We've been holding page lock,
+ * so I think not; but would rather try again later than BUG()
+ */
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return (found < 0) ? found : 0;
}
/*
@@ -1046,8 +1053,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
* sync from ever calling shmem_writepage; but a stacking filesystem
* may use the ->writepage of its underlying filesystem, in which case
* tmpfs should write out to swap only in response to memory pressure,
- * and not for pdflush or sync. However, in those cases, we do still
- * want to check if there's a redundant swappage to be discarded.
+ * and not for the writeback threads or sync. However, in those cases,
+ * we do still want to check if there's a redundant swappage to be
+ * discarded.
*/
if (wbc->for_reclaim)
swap = get_swap_page();
@@ -1079,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
else
inode = NULL;
spin_unlock(&info->lock);
- swap_duplicate(swap);
+ swap_shmem_alloc(swap);
BUG_ON(page_mapped(page));
page_cache_release(page); /* pagecache ref */
swap_writepage(page, wbc);
@@ -1633,8 +1641,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
- unlock_page(page);
set_page_dirty(page);
+ unlock_page(page);
page_cache_release(page);
return copied;
@@ -1971,13 +1979,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
iput(inode);
return error;
}
- unlock_page(page);
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, len);
kunmap_atomic(kaddr, KM_USER0);
set_page_dirty(page);
+ unlock_page(page);
page_cache_release(page);
}
if (dir->i_mode & S_ISGID)
@@ -2420,6 +2428,7 @@ static const struct address_space_operations shmem_aops = {
.write_end = shmem_write_end,
#endif
.migratepage = migrate_page,
+ .error_remove_page = generic_error_remove_page,
};
static const struct file_operations shmem_file_operations = {
@@ -2496,7 +2505,7 @@ static const struct super_operations shmem_ops = {
.put_super = shmem_put_super,
};
-static struct vm_operations_struct shmem_vm_ops = {
+static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481c96ba..3f4822938f46 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
return cachep->buffer_size;
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = {
#define BAD_ALIEN_MAGIC 0x01020304ul
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ */
+static enum {
+ NONE,
+ PARTIAL_AC,
+ PARTIAL_L3,
+ EARLY,
+ FULL
+} g_cpucache_up;
+
+/*
+ * used by boot code to determine if it can use slab based allocator
+ */
+int slab_is_available(void)
+{
+ return g_cpucache_up >= EARLY;
+}
+
#ifdef CONFIG_LOCKDEP
/*
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = {
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;
-static inline void init_lock_keys(void)
-
+static void init_node_lock_keys(int q)
{
- int q;
struct cache_sizes *s = malloc_sizes;
- while (s->cs_size != ULONG_MAX) {
- for_each_node(q) {
- struct array_cache **alc;
- int r;
- struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
- if (!l3 || OFF_SLAB(s->cs_cachep))
- continue;
- lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
- alc = l3->alien;
- /*
- * FIXME: This check for BAD_ALIEN_MAGIC
- * should go away when common slab code is taught to
- * work even without alien caches.
- * Currently, non NUMA code returns BAD_ALIEN_MAGIC
- * for alloc_alien_cache,
- */
- if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
- continue;
- for_each_node(r) {
- if (alc[r])
- lockdep_set_class(&alc[r]->lock,
- &on_slab_alc_key);
- }
+ if (g_cpucache_up != FULL)
+ return;
+
+ for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
+ struct array_cache **alc;
+ struct kmem_list3 *l3;
+ int r;
+
+ l3 = s->cs_cachep->nodelists[q];
+ if (!l3 || OFF_SLAB(s->cs_cachep))
+ return;
+ lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
+ alc = l3->alien;
+ /*
+ * FIXME: This check for BAD_ALIEN_MAGIC
+ * should go away when common slab code is taught to
+ * work even without alien caches.
+ * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+ * for alloc_alien_cache,
+ */
+ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+ return;
+ for_each_node(r) {
+ if (alc[r])
+ lockdep_set_class(&alc[r]->lock,
+ &on_slab_alc_key);
}
- s++;
}
}
+
+static inline void init_lock_keys(void)
+{
+ int node;
+
+ for_each_node(node)
+ init_node_lock_keys(node);
+}
#else
+static void init_node_lock_keys(int q)
+{
+}
+
static inline void init_lock_keys(void)
{
}
@@ -665,27 +697,7 @@ static inline void init_lock_keys(void)
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
-/*
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
- NONE,
- PARTIAL_AC,
- PARTIAL_L3,
- EARLY,
- FULL
-} g_cpucache_up;
-
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
-{
- return g_cpucache_up >= EARLY;
-}
-
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
@@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup);
* objects freed on different nodes from which they were allocated) and the
* flushing of remote pcps by calling drain_node_pages.
*/
-static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
@@ -836,17 +848,17 @@ static void init_reap_node(int cpu)
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
- per_cpu(reap_node, cpu) = node;
+ per_cpu(slab_reap_node, cpu) = node;
}
static void next_reap_node(void)
{
- int node = __get_cpu_var(reap_node);
+ int node = __get_cpu_var(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ __get_cpu_var(slab_reap_node) = node;
}
#else
@@ -863,7 +875,7 @@ static void next_reap_node(void)
*/
static void __cpuinit start_cpu_timer(int cpu)
{
- struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+ struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
@@ -1027,7 +1039,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(reap_node);
+ int node = __get_cpu_var(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu)
kfree(shared);
free_alien_cache(alien);
}
+ init_node_lock_keys(node);
+
return 0;
bad:
cpuup_canceled(cpu);
@@ -1286,9 +1300,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
+ cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
- per_cpu(reap_work, cpu).work.func = NULL;
+ per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
@@ -3103,13 +3117,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
}
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
- kmemleak_erase(&ac->entry[ac->avail]);
+ if (objp)
+ kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
@@ -3306,7 +3326,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- if (unlikely(nodeid == -1))
+ if (nodeid == -1)
nodeid = numa_node_id();
if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3558,7 +3578,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags, __builtin_return_address(0));
@@ -3621,7 +3641,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
gfp_t flags,
int nodeid)
@@ -3649,7 +3669,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
return ret;
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node,
@@ -3669,7 +3689,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return __do_kmalloc_node(size, flags, node, NULL);
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
#endif /* CONFIG_NUMA */
/**
@@ -3701,7 +3721,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, __builtin_return_address(0));
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc719552..8d71aaf888d7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
}
local_irq_restore(flags);
- if (unlikely((gfpflags & __GFP_ZERO) && object))
+ if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, objsize);
kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
@@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
{
return slab_alloc(s, gfpflags, -1, _RET_IP_);
@@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
gfp_t gfpflags,
int node)
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
return len + sprintf(buf + len, "\n");
}
+static void clear_stat(struct kmem_cache *s, enum stat_item si)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ get_cpu_slab(s, cpu)->stat[si] = 0;
+}
+
#define STAT_ATTR(si, text) \
static ssize_t text##_show(struct kmem_cache *s, char *buf) \
{ \
return show_stat(s, buf, si); \
} \
-SLAB_ATTR_RO(text); \
+static ssize_t text##_store(struct kmem_cache *s, \
+ const char *buf, size_t length) \
+{ \
+ if (buf[0] != '0') \
+ return -EINVAL; \
+ clear_stat(s, si); \
+ return length; \
+} \
+SLAB_ATTR(text); \
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1bf19daadc6..6c0585b16418 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
@@ -35,11 +36,15 @@
#include <linux/swapops.h>
#include <linux/page_cgroup.h>
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+ unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
+
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
-static int swap_overflow;
static int least_priority;
static const char Bad_file[] = "Bad swap file entry ";
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry ";
static struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
-/* For reference count accounting in swap_map */
-/* enum for swap_map[] handling. internal use only */
-enum {
- SWAP_MAP = 0, /* ops for reference from swap users */
- SWAP_CACHE, /* ops for reference from swap cache */
-};
-
-static inline int swap_count(unsigned short ent)
-{
- return ent & SWAP_COUNT_MASK;
-}
-
-static inline bool swap_has_cache(unsigned short ent)
+static inline unsigned char swap_count(unsigned char ent)
{
- return !!(ent & SWAP_HAS_CACHE);
+ return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
}
-static inline unsigned short encode_swapmap(int count, bool has_cache)
-{
- unsigned short ret = count;
-
- if (has_cache)
- return SWAP_HAS_CACHE | ret;
- return ret;
-}
-
-/* returnes 1 if swap entry is freed */
+/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
- int type = si - swap_info;
- swp_entry_t entry = swp_entry(type, offset);
+ swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
down_read(&swap_unplug_sem);
entry.val = page_private(page);
if (PageSwapCache(page)) {
- struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+ struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
struct backing_dev_info *bdi;
/*
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
static int discard_swap(struct swap_info_struct *si)
{
struct swap_extent *se;
+ sector_t start_block;
+ sector_t nr_blocks;
int err = 0;
- list_for_each_entry(se, &si->extent_list, list) {
- sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
- sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+ /* Do not discard the swap header page! */
+ se = &si->first_swap_extent;
+ start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
+ nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
+ if (nr_blocks) {
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ if (err)
+ return err;
+ cond_resched();
+ }
- if (se->start_page == 0) {
- /* Do not discard the swap header page! */
- start_block += 1 << (PAGE_SHIFT - 9);
- nr_blocks -= 1 << (PAGE_SHIFT - 9);
- if (!nr_blocks)
- continue;
- }
+ list_for_each_entry(se, &si->first_swap_extent.list, list) {
+ start_block = se->start_block << (PAGE_SHIFT - 9);
+ nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL,
- DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
if (err)
break;
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO,
- DISCARD_FL_BARRIER))
+ nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
break;
}
lh = se->list.next;
- if (lh == &si->extent_list)
- lh = lh->next;
se = list_entry(lh, struct swap_extent, list);
}
}
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word)
#define LATENCY_LIMIT 256
static inline unsigned long scan_swap_map(struct swap_info_struct *si,
- int cache)
+ unsigned char usage)
{
unsigned long offset;
unsigned long scan_base;
@@ -354,10 +339,7 @@ checks:
si->lowest_bit = si->max;
si->highest_bit = 0;
}
- if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
- si->swap_map[offset] = encode_swapmap(0, true);
- else /* at suspend */
- si->swap_map[offset] = encode_swapmap(1, false);
+ si->swap_map[offset] = usage;
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void)
nr_swap_pages--;
for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- si = swap_info + type;
+ si = swap_info[type];
next = si->next;
if (next < 0 ||
- (!wrapped && si->prio != swap_info[next].prio)) {
+ (!wrapped && si->prio != swap_info[next]->prio)) {
next = swap_list.head;
wrapped++;
}
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void)
swap_list.next = next;
/* This is called for allocating swap entry for cache */
- offset = scan_swap_map(si, SWAP_CACHE);
+ offset = scan_swap_map(si, SWAP_HAS_CACHE);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type)
pgoff_t offset;
spin_lock(&swap_lock);
- si = swap_info + type;
- if (si->flags & SWP_WRITEOK) {
+ si = swap_info[type];
+ if (si && (si->flags & SWP_WRITEOK)) {
nr_swap_pages--;
/* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, SWAP_MAP);
+ offset = scan_swap_map(si, 1);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type)
return (swp_entry_t) {0};
}
-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;
if (!entry.val)
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_nofile;
- p = & swap_info[type];
+ p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -554,41 +536,56 @@ out:
return NULL;
}
-static int swap_entry_free(struct swap_info_struct *p,
- swp_entry_t ent, int cache)
+static unsigned char swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
{
- unsigned long offset = swp_offset(ent);
- int count = swap_count(p->swap_map[offset]);
- bool has_cache;
+ unsigned long offset = swp_offset(entry);
+ unsigned char count;
+ unsigned char has_cache;
- has_cache = swap_has_cache(p->swap_map[offset]);
+ count = p->swap_map[offset];
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
- if (cache == SWAP_MAP) { /* dropping usage count of swap */
- if (count < SWAP_MAP_MAX) {
- count--;
- p->swap_map[offset] = encode_swapmap(count, has_cache);
- }
- } else { /* dropping swap cache flag */
+ if (usage == SWAP_HAS_CACHE) {
VM_BUG_ON(!has_cache);
- p->swap_map[offset] = encode_swapmap(count, false);
-
+ has_cache = 0;
+ } else if (count == SWAP_MAP_SHMEM) {
+ /*
+ * Or we could insist on shmem.c using a special
+ * swap_shmem_free() and free_shmem_swap_and_cache()...
+ */
+ count = 0;
+ } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+ if (count == COUNT_CONTINUED) {
+ if (swap_count_continued(p, offset, count))
+ count = SWAP_MAP_MAX | COUNT_CONTINUED;
+ else
+ count = SWAP_MAP_MAX;
+ } else
+ count--;
}
- /* return code. */
- count = p->swap_map[offset];
+
+ if (!count)
+ mem_cgroup_uncharge_swap(entry);
+
+ usage = count | has_cache;
+ p->swap_map[offset] = usage;
+
/* free if no reference */
- if (!count) {
+ if (!usage) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
- if (p->prio > swap_info[swap_list.next].prio)
- swap_list.next = p - swap_info;
+ if (swap_list.next >= 0 &&
+ p->prio > swap_info[swap_list.next]->prio)
+ swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
}
- if (!swap_count(count))
- mem_cgroup_uncharge_swap(ent);
- return count;
+
+ return usage;
}
/*
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p,
*/
void swap_free(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, SWAP_MAP);
+ swap_entry_free(p, entry, 1);
spin_unlock(&swap_lock);
}
}
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry)
void swapcache_free(swp_entry_t entry, struct page *page)
{
struct swap_info_struct *p;
- int ret;
+ unsigned char count;
p = swap_info_get(entry);
if (p) {
- ret = swap_entry_free(p, entry, SWAP_CACHE);
- if (page) {
- bool swapout;
- if (ret)
- swapout = true; /* the end of swap out */
- else
- swapout = false; /* no more swap users! */
- mem_cgroup_uncharge_swapcache(page, entry, swapout);
- }
+ count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
+ if (page)
+ mem_cgroup_uncharge_swapcache(page, entry, count != 0);
spin_unlock(&swap_lock);
}
- return;
}
/*
* How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
*/
static inline int page_swapcount(struct page *page)
{
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page)
int count;
VM_BUG_ON(!PageLocked(page));
+ if (unlikely(PageKsm(page)))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page)
SetPageDirty(page);
}
}
- return count == 1;
+ return count <= 1;
}
/*
@@ -699,12 +693,12 @@ int free_swap_and_cache(swp_entry_t entry)
struct swap_info_struct *p;
struct page *page = NULL;
- if (is_migration_entry(entry))
+ if (non_swap_entry(entry))
return 1;
p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
+ if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
page = find_get_page(&swapper_space, entry.val);
if (page && !trylock_page(page)) {
page_cache_release(page);
@@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry)
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
struct block_device *bdev = NULL;
- int i;
+ int type;
if (device)
bdev = bdget(device);
spin_lock(&swap_lock);
- for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *sis = swap_info + i;
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];
if (!(sis->flags & SWP_WRITEOK))
continue;
@@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
*bdev_p = bdgrab(sis->bdev);
spin_unlock(&swap_lock);
- return i;
+ return type;
}
if (bdev == sis->bdev) {
- struct swap_extent *se;
+ struct swap_extent *se = &sis->first_swap_extent;
- se = list_entry(sis->extent_list.next,
- struct swap_extent, list);
if (se->start_block == offset) {
if (bdev_p)
*bdev_p = bdgrab(sis->bdev);
spin_unlock(&swap_lock);
bdput(bdev);
- return i;
+ return type;
}
}
}
@@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
}
/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+ struct block_device *bdev;
+
+ if ((unsigned int)type >= nr_swapfiles)
+ return 0;
+ if (!(swap_info[type]->flags & SWP_WRITEOK))
+ return 0;
+ return map_swap_entry(swp_entry(type, offset), &bdev);
+}
+
+/*
* Return either the total number of swap pages of given type, or the number
* of free pages of that type (depending on @free)
*
@@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free)
{
unsigned int n = 0;
- if (type < nr_swapfiles) {
- spin_lock(&swap_lock);
- if (swap_info[type].flags & SWP_WRITEOK) {
- n = swap_info[type].pages;
+ spin_lock(&swap_lock);
+ if ((unsigned int)type < nr_swapfiles) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (sis->flags & SWP_WRITEOK) {
+ n = sis->pages;
if (free)
- n -= swap_info[type].inuse_pages;
+ n -= sis->inuse_pages;
}
- spin_unlock(&swap_lock);
}
+ spin_unlock(&swap_lock);
return n;
}
-#endif
+#endif /* CONFIG_HIBERNATION */
/*
* No need to decide whether this PTE shares the swap entry with others,
@@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma,
unsigned long addr, end, next;
int ret;
- if (page->mapping) {
+ if (page_anon_vma(page)) {
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
return 0;
@@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
{
unsigned int max = si->max;
unsigned int i = prev;
- int count;
+ unsigned char count;
/*
* No need for swap_lock here: we're just looking
@@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
*/
static int try_to_unuse(unsigned int type)
{
- struct swap_info_struct * si = &swap_info[type];
+ struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
- unsigned short *swap_map;
- unsigned short swcount;
+ unsigned char *swap_map;
+ unsigned char swcount;
struct page *page;
swp_entry_t entry;
unsigned int i = 0;
int retval = 0;
- int reset_overflow = 0;
- int shmem;
/*
* When searching mms for an entry, a good strategy is to
@@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type)
* together, child after parent. If we race with dup_mmap(), we
* prefer to resolve parent before child, lest we miss entries
* duplicated after we scanned child: using last mm would invert
- * that. Though it's only a serious concern when an overflowed
- * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+ * that.
*/
start_mm = &init_mm;
atomic_inc(&init_mm.mm_users);
@@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type)
/*
* Remove all references to entry.
- * Whenever we reach init_mm, there's no address space
- * to search, but use it as a reminder to search shmem.
*/
- shmem = 0;
swcount = *swap_map;
- if (swap_count(swcount)) {
- if (start_mm == &init_mm)
- shmem = shmem_unuse(entry, page);
- else
- retval = unuse_mm(start_mm, entry, page);
+ if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+ retval = shmem_unuse(entry, page);
+ /* page has already been unlocked and released */
+ if (retval < 0)
+ break;
+ continue;
}
+ if (swap_count(swcount) && start_mm != &init_mm)
+ retval = unuse_mm(start_mm, entry, page);
+
if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
@@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type)
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (swap_count(*swap_map) && !retval && !shmem &&
+ while (swap_count(*swap_map) && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1145,14 +1152,12 @@ static int try_to_unuse(unsigned int type)
swcount = *swap_map;
if (!swap_count(swcount)) /* any usage ? */
;
- else if (mm == &init_mm) {
+ else if (mm == &init_mm)
set_start_mm = 1;
- shmem = shmem_unuse(entry, page);
- } else
+ else
retval = unuse_mm(mm, entry, page);
- if (set_start_mm &&
- swap_count(*swap_map) < swcount) {
+ if (set_start_mm && *swap_map < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
new_start_mm = mm;
@@ -1165,13 +1170,6 @@ static int try_to_unuse(unsigned int type)
mmput(start_mm);
start_mm = new_start_mm;
}
- if (shmem) {
- /* page has already been unlocked and released */
- if (shmem > 0)
- continue;
- retval = shmem;
- break;
- }
if (retval) {
unlock_page(page);
page_cache_release(page);
@@ -1179,30 +1177,6 @@ static int try_to_unuse(unsigned int type)
}
/*
- * How could swap count reach 0x7ffe ?
- * There's no way to repeat a swap page within an mm
- * (except in shmem, where it's the shared object which takes
- * the reference count)?
- * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
- * short is too small....)
- * If that's wrong, then we should worry more about
- * exit_mmap() and do_munmap() cases described above:
- * we might be resetting SWAP_MAP_MAX too early here.
- * We know "Undead"s can happen, they're okay, so don't
- * report them; but do report if we reset SWAP_MAP_MAX.
- */
- /* We might release the lock_page() in unuse_mm(). */
- if (!PageSwapCache(page) || page_private(page) != entry.val)
- goto retry;
-
- if (swap_count(*swap_map) == SWAP_MAP_MAX) {
- spin_lock(&swap_lock);
- *swap_map = encode_swapmap(0, true);
- spin_unlock(&swap_lock);
- reset_overflow = 1;
- }
-
- /*
* If a reference remains (rare), we would like to leave
* the page in the swap cache; but try_to_unmap could
* then re-duplicate the entry once we drop page lock,
@@ -1214,6 +1188,12 @@ static int try_to_unuse(unsigned int type)
* read from disk into another page. Splitting into two
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
+ *
+ * Given how unuse_vma() targets one particular offset
+ * in an anon_vma, once the anon_vma has been determined,
+ * this splitting happens to be just what is needed to
+ * handle where KSM pages have been swapped out: re-reading
+ * is unnecessarily slow, but we can fix that later on.
*/
if (swap_count(*swap_map) &&
PageDirty(page) && PageSwapCache(page)) {
@@ -1243,7 +1223,6 @@ static int try_to_unuse(unsigned int type)
* mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
-retry:
unlock_page(page);
page_cache_release(page);
@@ -1255,10 +1234,6 @@ retry:
}
mmput(start_mm);
- if (reset_overflow) {
- printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
- swap_overflow = 0;
- }
return retval;
}
@@ -1271,10 +1246,10 @@ retry:
static void drain_mmlist(void)
{
struct list_head *p, *next;
- unsigned int i;
+ unsigned int type;
- for (i = 0; i < nr_swapfiles; i++)
- if (swap_info[i].inuse_pages)
+ for (type = 0; type < nr_swapfiles; type++)
+ if (swap_info[type]->inuse_pages)
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1284,12 +1259,23 @@ static void drain_mmlist(void)
/*
* Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
*/
-sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
- struct swap_extent *se = sis->curr_swap_extent;
- struct swap_extent *start_se = se;
+ struct swap_info_struct *sis;
+ struct swap_extent *start_se;
+ struct swap_extent *se;
+ pgoff_t offset;
+
+ sis = swap_info[swp_type(entry)];
+ *bdev = sis->bdev;
+
+ offset = swp_offset(entry);
+ start_se = sis->curr_swap_extent;
+ se = start_se;
for ( ; ; ) {
struct list_head *lh;
@@ -1299,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
return se->start_block + (offset - se->start_page);
}
lh = se->list.next;
- if (lh == &sis->extent_list)
- lh = lh->next;
se = list_entry(lh, struct swap_extent, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
}
}
-#ifdef CONFIG_HIBERNATION
/*
- * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
- * corresponding to given index in swap_info (swap type).
+ * Returns the page offset into bdev for the specified page's swap entry.
*/
-sector_t swapdev_block(int swap_type, pgoff_t offset)
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
{
- struct swap_info_struct *sis;
-
- if (swap_type >= nr_swapfiles)
- return 0;
-
- sis = swap_info + swap_type;
- return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+ swp_entry_t entry;
+ entry.val = page_private(page);
+ return map_swap_entry(entry, bdev);
}
-#endif /* CONFIG_HIBERNATION */
/*
* Free all of a swapdev's extent information
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
- while (!list_empty(&sis->extent_list)) {
+ while (!list_empty(&sis->first_swap_extent.list)) {
struct swap_extent *se;
- se = list_entry(sis->extent_list.next,
+ se = list_entry(sis->first_swap_extent.list.next,
struct swap_extent, list);
list_del(&se->list);
kfree(se);
@@ -1353,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
struct swap_extent *new_se;
struct list_head *lh;
- lh = sis->extent_list.prev; /* The highest page extent */
- if (lh != &sis->extent_list) {
+ if (start_page == 0) {
+ se = &sis->first_swap_extent;
+ sis->curr_swap_extent = se;
+ se->start_page = 0;
+ se->nr_pages = nr_pages;
+ se->start_block = start_block;
+ return 1;
+ } else {
+ lh = sis->first_swap_extent.list.prev; /* Highest extent */
se = list_entry(lh, struct swap_extent, list);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
@@ -1374,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;
- list_add_tail(&new_se->list, &sis->extent_list);
+ list_add_tail(&new_se->list, &sis->first_swap_extent.list);
return 1;
}
@@ -1426,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
- goto done;
+ goto out;
}
blkbits = inode->i_blkbits;
@@ -1497,25 +1481,22 @@ reprobe:
sis->max = page_no;
sis->pages = page_no - 1;
sis->highest_bit = page_no - 1;
-done:
- sis->curr_swap_extent = list_entry(sis->extent_list.prev,
- struct swap_extent, list);
- goto out;
+out:
+ return ret;
bad_bmap:
printk(KERN_ERR "swapon: swapfile has holes\n");
ret = -EINVAL;
-out:
- return ret;
+ goto out;
}
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
- struct swap_info_struct * p = NULL;
- unsigned short *swap_map;
+ struct swap_info_struct *p = NULL;
+ unsigned char *swap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
- char * pathname;
+ char *pathname;
int i, type, prev;
int err;
@@ -1536,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mapping = victim->f_mapping;
prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
- p = swap_info + type;
+ for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+ p = swap_info[type];
if (p->flags & SWP_WRITEOK) {
if (p->swap_file->f_mapping == mapping)
break;
@@ -1556,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0) {
+ if (prev < 0)
swap_list.head = p->next;
- } else {
- swap_info[prev].next = p->next;
- }
+ else
+ swap_info[prev]->next = p->next;
if (type == swap_list.next) {
/* just pick something that's safe... */
swap_list.next = swap_list.head;
}
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i].next)
- swap_info[i].prio = p->prio--;
+ for (i = p->next; i >= 0; i = swap_info[i]->next)
+ swap_info[i]->prio = p->prio--;
least_priority++;
}
nr_swap_pages -= p->pages;
@@ -1585,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->prio < 0)
p->prio = --least_priority;
prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
- if (p->prio >= swap_info[i].prio)
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
break;
prev = i;
}
p->next = i;
if (prev < 0)
- swap_list.head = swap_list.next = p - swap_info;
+ swap_list.head = swap_list.next = type;
else
- swap_info[prev].next = p - swap_info;
+ swap_info[prev]->next = type;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
p->flags |= SWP_WRITEOK;
@@ -1607,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
up_write(&swap_unplug_sem);
destroy_swap_extents(p);
+ if (p->flags & SWP_CONTINUED)
+ free_swap_count_continuations(p);
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
drain_mmlist();
@@ -1654,8 +1637,8 @@ out:
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
- struct swap_info_struct *ptr = swap_info;
- int i;
+ struct swap_info_struct *si;
+ int type;
loff_t l = *pos;
mutex_lock(&swapon_mutex);
@@ -1663,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (i = 0; i < nr_swapfiles; i++, ptr++) {
- if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+ for (type = 0; type < nr_swapfiles; type++) {
+ smp_rmb(); /* read nr_swapfiles before swap_info[type] */
+ si = swap_info[type];
+ if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
- return ptr;
+ return si;
}
return NULL;
@@ -1675,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
- struct swap_info_struct *ptr;
- struct swap_info_struct *endptr = swap_info + nr_swapfiles;
+ struct swap_info_struct *si = v;
+ int type;
if (v == SEQ_START_TOKEN)
- ptr = swap_info;
- else {
- ptr = v;
- ptr++;
- }
+ type = 0;
+ else
+ type = si->type + 1;
- for (; ptr < endptr; ptr++) {
- if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+ for (; type < nr_swapfiles; type++) {
+ smp_rmb(); /* read nr_swapfiles before swap_info[type] */
+ si = swap_info[type];
+ if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
- return ptr;
+ return si;
}
return NULL;
@@ -1702,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v)
static int swap_show(struct seq_file *swap, void *v)
{
- struct swap_info_struct *ptr = v;
+ struct swap_info_struct *si = v;
struct file *file;
int len;
- if (ptr == SEQ_START_TOKEN) {
+ if (si == SEQ_START_TOKEN) {
seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
return 0;
}
- file = ptr->swap_file;
+ file = si->swap_file;
len = seq_path(swap, &file->f_path, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
- ptr->pages << (PAGE_SHIFT - 10),
- ptr->inuse_pages << (PAGE_SHIFT - 10),
- ptr->prio);
+ si->pages << (PAGE_SHIFT - 10),
+ si->inuse_pages << (PAGE_SHIFT - 10),
+ si->prio);
return 0;
}
@@ -1766,7 +1751,7 @@ late_initcall(max_swapfiles_check);
*/
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
char *name = NULL;
struct block_device *bdev = NULL;
struct file *swap_file = NULL;
@@ -1780,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages = 1;
unsigned long swapfilepages;
- unsigned short *swap_map = NULL;
+ unsigned char *swap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
int did_down = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
spin_lock(&swap_lock);
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++)
- if (!(p->flags & SWP_USED))
+ for (type = 0; type < nr_swapfiles; type++) {
+ if (!(swap_info[type]->flags & SWP_USED))
break;
+ }
error = -EPERM;
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
+ kfree(p);
goto out;
}
- if (type >= nr_swapfiles)
- nr_swapfiles = type+1;
- memset(p, 0, sizeof(*p));
- INIT_LIST_HEAD(&p->extent_list);
+ if (type >= nr_swapfiles) {
+ p->type = type;
+ swap_info[type] = p;
+ /*
+ * Write swap_info[type] before nr_swapfiles, in case a
+ * racing procfs swap_start() or swap_next() is reading them.
+ * (We never shrink nr_swapfiles, we never free this entry.)
+ */
+ smp_wmb();
+ nr_swapfiles++;
+ } else {
+ kfree(p);
+ p = swap_info[type];
+ /*
+ * Do not memset this entry: a racing procfs swap_next()
+ * would be relying on p->type to remain valid.
+ */
+ }
+ INIT_LIST_HEAD(&p->first_swap_extent.list);
p->flags = SWP_USED;
p->next = -1;
spin_unlock(&swap_lock);
+
name = getname(specialfile);
error = PTR_ERR(name);
if (IS_ERR(name)) {
@@ -1823,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -EBUSY;
for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *q = &swap_info[i];
+ struct swap_info_struct *q = swap_info[i];
if (i == type || !q->swap_file)
continue;
@@ -1898,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->lowest_bit = 1;
p->cluster_next = 1;
+ p->cluster_nr = 0;
/*
* Find out how many pages are allowed for a single swap
@@ -1933,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
/* OK, set up the swap map and apply the bad block list */
- swap_map = vmalloc(maxpages * sizeof(short));
+ swap_map = vmalloc(maxpages);
if (!swap_map) {
error = -ENOMEM;
goto bad_swap;
}
- memset(swap_map, 0, maxpages * sizeof(short));
+ memset(swap_map, 0, maxpages);
for (i = 0; i < swap_header->info.nr_badpages; i++) {
int page_nr = swap_header->info.badpages[i];
if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -1974,12 +1982,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
}
- if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
- p->flags |= SWP_SOLIDSTATE;
- p->cluster_next = 1 + (random32() % p->highest_bit);
+ if (p->bdev) {
+ if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next = 1 + (random32() % p->highest_bit);
+ }
+ if (discard_swap(p) == 0)
+ p->flags |= SWP_DISCARDABLE;
}
- if (discard_swap(p) == 0)
- p->flags |= SWP_DISCARDABLE;
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
@@ -2002,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
/* insert swap space into swap_list: */
prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
- if (p->prio >= swap_info[i].prio) {
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
break;
- }
prev = i;
}
p->next = i;
- if (prev < 0) {
- swap_list.head = swap_list.next = p - swap_info;
- } else {
- swap_info[prev].next = p - swap_info;
- }
+ if (prev < 0)
+ swap_list.head = swap_list.next = type;
+ else
+ swap_info[prev]->next = type;
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
error = 0;
@@ -2050,15 +2058,15 @@ out:
void si_swapinfo(struct sysinfo *val)
{
- unsigned int i;
+ unsigned int type;
unsigned long nr_to_be_unused = 0;
spin_lock(&swap_lock);
- for (i = 0; i < nr_swapfiles; i++) {
- if (!(swap_info[i].flags & SWP_USED) ||
- (swap_info[i].flags & SWP_WRITEOK))
- continue;
- nr_to_be_unused += swap_info[i].inuse_pages;
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *si = swap_info[type];
+
+ if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+ nr_to_be_unused += si->inuse_pages;
}
val->freeswap = nr_swap_pages + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2068,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val)
/*
* Verify that a swap entry is valid and increment its swap map count.
*
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
* - swp_entry is migration entry -> EINVAL
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
-static int __swap_duplicate(swp_entry_t entry, bool cache)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;
- int result = -EINVAL;
- int count;
- bool has_cache;
+ unsigned char count;
+ unsigned char has_cache;
+ int err = -EINVAL;
- if (is_migration_entry(entry))
- return -EINVAL;
+ if (non_swap_entry(entry))
+ goto out;
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_file;
- p = type + swap_info;
+ p = swap_info[type];
offset = swp_offset(entry);
spin_lock(&swap_lock);
-
if (unlikely(offset >= p->max))
goto unlock_out;
- count = swap_count(p->swap_map[offset]);
- has_cache = swap_has_cache(p->swap_map[offset]);
+ count = p->swap_map[offset];
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+ err = 0;
- if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+ if (usage == SWAP_HAS_CACHE) {
/* set SWAP_HAS_CACHE if there is no cache and entry is used */
- if (!has_cache && count) {
- p->swap_map[offset] = encode_swapmap(count, true);
- result = 0;
- } else if (has_cache) /* someone added cache */
- result = -EEXIST;
- else if (!count) /* no users */
- result = -ENOENT;
+ if (!has_cache && count)
+ has_cache = SWAP_HAS_CACHE;
+ else if (has_cache) /* someone else added cache */
+ err = -EEXIST;
+ else /* no users remaining */
+ err = -ENOENT;
} else if (count || has_cache) {
- if (count < SWAP_MAP_MAX - 1) {
- p->swap_map[offset] = encode_swapmap(count + 1,
- has_cache);
- result = 0;
- } else if (count <= SWAP_MAP_MAX) {
- if (swap_overflow++ < 5)
- printk(KERN_WARNING
- "swap_dup: swap entry overflow\n");
- p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
- has_cache);
- result = 0;
- }
+
+ if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ count += usage;
+ else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+ err = -EINVAL;
+ else if (swap_count_continued(p, offset, count))
+ count = COUNT_CONTINUED;
+ else
+ err = -ENOMEM;
} else
- result = -ENOENT; /* unused swap entry */
+ err = -ENOENT; /* unused swap entry */
+
+ p->swap_map[offset] = count | has_cache;
+
unlock_out:
spin_unlock(&swap_lock);
out:
- return result;
+ return err;
bad_file:
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
+
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+ __swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
/*
* increase reference count of swap entry by 1.
*/
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
{
- __swap_duplicate(entry, SWAP_MAP);
+ int err = 0;
+
+ while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+ err = add_swap_count_continuation(entry, GFP_ATOMIC);
+ return err;
}
/*
* @entry: swap entry for which we allocate swap cache.
*
- * Called when allocating swap cache for exising swap entry,
+ * Called when allocating swap cache for existing swap entry,
* This can return error codes. Returns 0 at success.
* -EBUSY means there is a swap cache.
* Note: return code is different from swap_duplicate().
*/
int swapcache_prepare(swp_entry_t entry)
{
- return __swap_duplicate(entry, SWAP_CACHE);
-}
-
-
-struct swap_info_struct *
-get_swap_info_struct(unsigned type)
-{
- return &swap_info[type];
+ return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
/*
@@ -2180,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
if (!our_page_cluster) /* no readahead */
return 0;
- si = &swap_info[swp_type(entry)];
+ si = swap_info[swp_type(entry)];
target = swp_offset(entry);
base = (target >> our_page_cluster) << our_page_cluster;
end = base + (1 << our_page_cluster);
@@ -2216,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
*offset = ++toff;
return nr_pages? ++nr_pages: 0;
}
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+ struct swap_info_struct *si;
+ struct page *head;
+ struct page *page;
+ struct page *list_page;
+ pgoff_t offset;
+ unsigned char count;
+
+ /*
+ * When debugging, it's easier to use __GFP_ZERO here; but it's better
+ * for latency not to zero a page while GFP_ATOMIC and holding locks.
+ */
+ page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+ si = swap_info_get(entry);
+ if (!si) {
+ /*
+ * An acceptable race has occurred since the failing
+ * __swap_duplicate(): the swap entry has been freed,
+ * perhaps even the whole swap_map cleared for swapoff.
+ */
+ goto outer;
+ }
+
+ offset = swp_offset(entry);
+ count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+ if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+ /*
+ * The higher the swap count, the more likely it is that tasks
+ * will race to add swap count continuation: we need to avoid
+ * over-provisioning.
+ */
+ goto out;
+ }
+
+ if (!page) {
+ spin_unlock(&swap_lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+ * no architecture is using highmem pages for kernel pagetables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+ */
+ head = vmalloc_to_page(si->swap_map + offset);
+ offset &= ~PAGE_MASK;
+
+ /*
+ * Page allocation does not initialize the page's lru field,
+ * but it does always reset its private field.
+ */
+ if (!page_private(head)) {
+ BUG_ON(count & COUNT_CONTINUED);
+ INIT_LIST_HEAD(&head->lru);
+ set_page_private(head, SWP_CONTINUED);
+ si->flags |= SWP_CONTINUED;
+ }
+
+ list_for_each_entry(list_page, &head->lru, lru) {
+ unsigned char *map;
+
+ /*
+ * If the previous map said no continuation, but we've found
+ * a continuation page, free our allocation and use this one.
+ */
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ map = kmap_atomic(list_page, KM_USER0) + offset;
+ count = *map;
+ kunmap_atomic(map, KM_USER0);
+
+ /*
+ * If this continuation count now has some space in it,
+ * free our allocation and use this one.
+ */
+ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+ goto out;
+ }
+
+ list_add_tail(&page->lru, &head->lru);
+ page = NULL; /* now it's attached, don't free it */
+out:
+ spin_unlock(&swap_lock);
+outer:
+ if (page)
+ __free_page(page);
+ return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+ pgoff_t offset, unsigned char count)
+{
+ struct page *head;
+ struct page *page;
+ unsigned char *map;
+
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head) != SWP_CONTINUED) {
+ BUG_ON(count & COUNT_CONTINUED);
+ return false; /* need to add count continuation */
+ }
+
+ offset &= ~PAGE_MASK;
+ page = list_entry(head->lru.next, struct page, lru);
+ map = kmap_atomic(page, KM_USER0) + offset;
+
+ if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
+ goto init_map; /* jump over SWAP_CONT_MAX checks */
+
+ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+ /*
+ * Think of how you add 1 to 999
+ */
+ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.next, struct page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page, KM_USER0) + offset;
+ }
+ if (*map == SWAP_CONT_MAX) {
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.next, struct page, lru);
+ if (page == head)
+ return false; /* add count continuation */
+ map = kmap_atomic(page, KM_USER0) + offset;
+init_map: *map = 0; /* we didn't zero the page */
+ }
+ *map += 1;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ while (page != head) {
+ map = kmap_atomic(page, KM_USER0) + offset;
+ *map = COUNT_CONTINUED;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ }
+ return true; /* incremented */
+
+ } else { /* decrementing */
+ /*
+ * Think of how you subtract 1 from 1000
+ */
+ BUG_ON(count != COUNT_CONTINUED);
+ while (*map == COUNT_CONTINUED) {
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.next, struct page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page, KM_USER0) + offset;
+ }
+ BUG_ON(*map == 0);
+ *map -= 1;
+ if (*map == 0)
+ count = 0;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ while (page != head) {
+ map = kmap_atomic(page, KM_USER0) + offset;
+ *map = SWAP_CONT_MAX | count;
+ count = COUNT_CONTINUED;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ }
+ return count == COUNT_CONTINUED;
+ }
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+ pgoff_t offset;
+
+ for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+ struct page *head;
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head)) {
+ struct list_head *this, *next;
+ list_for_each_safe(this, next, &head->lru) {
+ struct page *page;
+ page = list_entry(this, struct page, lru);
+ list_del(this);
+ __free_page(page);
+ }
+ }
+ }
+}
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
+static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
if (page->mapping != mapping)
- return;
+ return -EIO;
if (page_has_private(page))
do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
remove_from_page_cache(page);
ClearPageMappedToDisk(page);
page_cache_release(page); /* pagecache ref */
+ return 0;
}
/*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
return ret;
}
+int truncate_inode_page(struct address_space *mapping, struct page *page)
+{
+ if (page_mapped(page)) {
+ unmap_mapping_range(mapping,
+ (loff_t)page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
+ return truncate_complete_page(mapping, page);
+}
+
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+ if (!mapping)
+ return -EINVAL;
+ /*
+ * Only punch for normal data pages for now.
+ * Handling other types like directories would need more auditing.
+ */
+ if (!S_ISREG(mapping->host->i_mode))
+ return -EIO;
+ return truncate_inode_page(mapping, page);
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
+/*
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages. The page must be locked.
+ *
+ * Returns 1 if the page is successfully invalidated, otherwise 0.
+ */
+int invalidate_inode_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+ if (PageDirty(page) || PageWriteback(page))
+ return 0;
+ if (page_mapped(page))
+ return 0;
+ return invalidate_complete_page(mapping, page);
+}
+
/**
* truncate_inode_pages - truncate range of pages specified by start & end byte offsets
* @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
unlock_page(page);
continue;
}
- if (page_mapped(page)) {
- unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
- }
- truncate_complete_page(mapping, page);
+ truncate_inode_page(mapping, page);
unlock_page(page);
}
pagevec_release(&pvec);
@@ -231,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
break;
}
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -238,18 +280,14 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
lock_page(page);
wait_on_page_writeback(page);
- if (page_mapped(page)) {
- unmap_mapping_range(mapping,
- (loff_t)page->index<<PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
- }
+ truncate_inode_page(mapping, page);
if (page->index > next)
next = page->index;
next++;
- truncate_complete_page(mapping, page);
unlock_page(page);
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -291,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
while (next <= end &&
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t index;
@@ -311,17 +350,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
if (lock_failed)
continue;
- if (PageDirty(page) || PageWriteback(page))
- goto unlock;
- if (page_mapped(page))
- goto unlock;
- ret += invalidate_complete_page(mapping, page);
-unlock:
+ ret += invalidate_inode_page(page);
+
unlock_page(page);
if (next > end)
break;
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
cond_resched();
}
return ret;
@@ -396,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
while (next <= end && !wrapped &&
pagevec_lookup(&pvec, mapping, next,
min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index;
@@ -445,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
unlock_page(page);
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
cond_resched();
}
return ret;
@@ -458,10 +496,74 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @old: old file offset
+ * @new: new file offset
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
+{
+ if (new < old) {
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, new);
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ }
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page. Ugly, but necessary.
+ */
+int vmtruncate(struct inode *inode, loff_t offset)
+{
+ loff_t oldsize;
+ int error;
+
+ error = inode_newsize_ok(inode, offset);
+ if (error)
+ return error;
+ oldsize = inode->i_size;
+ i_size_write(inode, offset);
+ truncate_pagecache(inode, oldsize, offset);
+ if (inode->i_op->truncate)
+ inode->i_op->truncate(inode);
+
+ return error;
+}
+EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f927..b377ce430803 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,10 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/sched.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
#define CREATE_TRACE_POINTS
@@ -268,6 +272,46 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ struct file * file = NULL;
+ unsigned long retval = -EBADF;
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ if (unlikely(flags & MAP_HUGETLB))
+ return -EINVAL;
+ file = fget(fd);
+ if (!file)
+ goto out;
+ } else if (flags & MAP_HUGETLB) {
+ struct user_struct *user = NULL;
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called
+ * A dummy user value is used because we are not locking
+ * memory so no accounting is necessary
+ */
+ len = ALIGN(len, huge_page_size(&default_hstate));
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ down_write(&current->mm->mmap_sem);
+ retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&current->mm->mmap_sem);
+
+ if (file)
+ fput(file);
+out:
+ return retval;
+}
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 69511e663234..37e69295f250 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
@@ -25,10 +26,10 @@
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
-#include <linux/highmem.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
+#include <asm/shmparam.h>
/*** Page table manipulation functions ***/
@@ -760,7 +761,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
spin_lock(&vbq->lock);
list_add(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_cpu_blocks);
+ put_cpu_var(vmap_block_queue);
return vb;
}
@@ -825,7 +826,7 @@ again:
}
spin_unlock(&vb->lock);
}
- put_cpu_var(vmap_cpu_blocks);
+ put_cpu_var(vmap_block_queue);
rcu_read_unlock();
if (!addr) {
@@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long flags, unsigned long start, unsigned long end,
- int node, gfp_t gfp_mask, void *caller)
+ unsigned long align, unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
static struct vmap_area *va;
struct vm_struct *area;
- unsigned long align = 1;
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP) {
@@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end)
{
- return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+ return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
__builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
void *caller)
{
- return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+ return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
caller);
}
@@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, __builtin_return_address(0));
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
void *caller)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, caller);
}
struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
int node, gfp_t gfp_mask)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
- gfp_mask, __builtin_return_address(0));
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ node, gfp_mask, __builtin_return_address(0));
}
static struct vm_struct *find_vm_area(const void *addr)
@@ -1403,13 +1403,15 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
int node, void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, void *caller)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
+ gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
@@ -1417,13 +1419,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
+ pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, caller);
area->flags |= VM_VPAGES;
} else {
- pages = kmalloc_node(array_size,
- (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
- node);
+ pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
area->caller = caller;
@@ -1476,6 +1476,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
+ * @align: desired alignment
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
@@ -1485,8 +1486,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1496,8 +1498,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
- area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
- node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
+ VMALLOC_END, node, gfp_mask, caller);
if (!area)
return NULL;
@@ -1516,7 +1518,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, gfp_mask, prot, -1,
+ return __vmalloc_node(size, 1, gfp_mask, prot, -1,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
@@ -1532,7 +1534,7 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
-1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -1549,7 +1551,8 @@ void *vmalloc_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ ret = __vmalloc_node(size, SHMLBA,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL, -1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
@@ -1572,7 +1575,7 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -1595,7 +1598,7 @@ EXPORT_SYMBOL(vmalloc_node);
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
-1, __builtin_return_address(0));
}
@@ -1616,7 +1619,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
-1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -1633,7 +1636,7 @@ void *vmalloc_32_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
-1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 613e89f471d9..885207a6b6b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,11 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
+ /* How many pages shrink_list() should reclaim */
+ unsigned long nr_to_reclaim;
+
+ unsigned long hibernation_mode;
+
/* This context's GFP mask */
gfp_t gfp_mask;
@@ -66,12 +71,6 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
int may_swap;
- /* This context's SWAP_CLUSTER_MAX. If freeing memory for
- * suspend, we effectively ignore SWAP_CLUSTER_MAX.
- * In this context, it doesn't matter that we scan the
- * whole list at once. */
- int swap_cluster_max;
-
int swappiness;
int all_unreclaimable;
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
- * If this process is currently in generic_file_write() against
+ * If this process is currently in __generic_file_aio_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
@@ -544,6 +543,16 @@ redo:
*/
lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
+ /*
+ * When racing with an mlock clearing (page is
+ * unlocked), make sure that if the other thread does
+ * not observe our setting of PG_lru and fails
+ * isolation, we see PG_mlocked cleared below and move
+ * the page back to the evictable list.
+ *
+ * The other side is TestClearPageMlocked().
+ */
+ smp_mb();
}
/*
@@ -663,7 +672,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, 0)) {
+ switch (try_to_unmap(page, TTU_UNMAP)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -1088,7 +1097,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
int lumpy_reclaim = 0;
while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -1122,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_anon;
unsigned long nr_file;
- nr_taken = sc->isolate_pages(sc->swap_cluster_max,
+ nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
&page_list, &nr_scan, sc->order, mode,
zone, sc->mem_cgroup, 0, file);
@@ -1156,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
- reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
- reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
- reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
- reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+ reclaim_stat->recent_scanned[0] += nr_anon;
+ reclaim_stat->recent_scanned[1] += nr_file;
spin_unlock_irq(&zone->lru_lock);
@@ -1356,7 +1363,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+ if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
@@ -1454,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
return low;
}
+static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
+ int file)
+{
+ if (file)
+ return inactive_file_is_low(zone, sc);
+ else
+ return inactive_anon_is_low(zone, sc);
+}
+
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct zone *zone, struct scan_control *sc, int priority)
{
int file = is_file_lru(lru);
- if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
- shrink_active_list(nr_to_scan, zone, sc, priority, file);
+ if (is_active_lru(lru)) {
+ if (inactive_list_is_low(zone, sc, file))
+ shrink_active_list(nr_to_scan, zone, sc, priority, file);
return 0;
}
- if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
- shrink_active_list(nr_to_scan, zone, sc, priority, file);
- return 0;
- }
return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}
@@ -1557,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
* until we collected @swap_cluster_max pages to scan.
*/
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan,
- unsigned long swap_cluster_max)
+ unsigned long *nr_saved_scan)
{
unsigned long nr;
*nr_saved_scan += nr_to_scan;
nr = *nr_saved_scan;
- if (nr >= swap_cluster_max)
+ if (nr >= SWAP_CLUSTER_MAX)
*nr_saved_scan = 0;
else
nr = 0;
@@ -1584,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone,
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
- unsigned long swap_cluster_max = sc->swap_cluster_max;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
int noswap = 0;
@@ -1606,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone,
scan = (scan * percent[file]) / 100;
}
nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l],
- swap_cluster_max);
+ &reclaim_stat->nr_saved_scan[l]);
}
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
if (nr[l]) {
- nr_to_scan = min(nr[l], swap_cluster_max);
+ nr_to_scan = min_t(unsigned long,
+ nr[l], SWAP_CLUSTER_MAX);
nr[l] -= nr_to_scan;
nr_reclaimed += shrink_list(l, nr_to_scan,
@@ -1629,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone,
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (nr_reclaimed > swap_cluster_max &&
- priority < DEF_PRIORITY && !current_is_kswapd())
+ if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
@@ -1709,10 +1720,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
- * caller can't do much about. We kick pdflush and take explicit naps in the
- * hope that some of these pages can be written. But if the allocating task
- * holds filesystem locks which prevent writeout this might not work, and the
- * allocation attempt will fail.
+ * caller can't do much about. We kick the writeback threads and take explicit
+ * naps in the hope that some of these pages can be written. But if the
+ * allocating task holds filesystem locks which prevent writeout this might not
+ * work, and the allocation attempt will fail.
*
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
@@ -1728,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+ unsigned long writeback_threshold;
delayacct_freepages_start();
@@ -1763,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
}
}
total_scanned += sc->nr_scanned;
- if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
ret = sc->nr_reclaimed;
goto out;
}
@@ -1775,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > sc->swap_cluster_max +
- sc->swap_cluster_max / 2) {
+ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
+ if (total_scanned > writeback_threshold) {
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
- if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
+ if (!sc->hibernation_mode && sc->nr_scanned &&
+ priority < DEF_PRIORITY - 2)
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
/* top priority shrink_zones still had more to do? don't OOM, then */
@@ -1821,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
- .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
.swappiness = vm_swappiness,
@@ -1836,23 +1849,55 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+ gfp_t gfp_mask, bool noswap,
+ unsigned int swappiness,
+ struct zone *zone, int nid)
+{
+ struct scan_control sc = {
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .may_swap = !noswap,
+ .swappiness = swappiness,
+ .order = 0,
+ .mem_cgroup = mem,
+ .isolate_pages = mem_cgroup_isolate_pages,
+ };
+ nodemask_t nm = nodemask_of_node(nid);
+
+ sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+ sc.nodemask = &nm;
+ sc.nr_reclaimed = 0;
+ sc.nr_scanned = 0;
+ /*
+ * NOTE: Although we can get the priority field, using it
+ * here is not a good idea, since it limits the pages we can scan.
+ * if we don't reclaim here, the shrink_zone from balance_pgdat
+ * will pick up pages from other mem cgroup's as well. We hack
+ * the priority and make it zero.
+ */
+ shrink_zone(0, zone, &sc);
+ return sc.nr_reclaimed;
+}
+
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
bool noswap,
unsigned int swappiness)
{
+ struct zonelist *zonelist;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
- .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages,
.nodemask = NULL, /* we don't care the placement */
};
- struct zonelist *zonelist;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1861,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
}
#endif
+/* is kswapd sleeping prematurely? */
+static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+{
+ int i;
+
+ /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
+ if (remaining)
+ return 1;
+
+ /* If after HZ/10, a zone is below the high mark, it's premature */
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+ 0, 0))
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
@@ -1893,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
.may_swap = 1,
- .swap_cluster_max = SWAP_CLUSTER_MAX,
+ /*
+ * kswapd doesn't want to be bailed out while reclaim. because
+ * we want to put equal scanning pressure on each zone.
+ */
+ .nr_to_reclaim = ULONG_MAX,
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
@@ -1918,6 +1991,7 @@ loop_again:
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
+ int has_under_min_watermark_zone = 0;
/* The swap token gets in the way of swapout... */
if (!priority)
@@ -1974,6 +2048,7 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
+ int nid, zid;
if (!populated_zone(zone))
continue;
@@ -1988,6 +2063,15 @@ loop_again:
temp_priority[i] = priority;
sc.nr_scanned = 0;
note_zone_scanning_priority(zone, priority);
+
+ nid = pgdat->node_id;
+ zid = zone_idx(zone);
+ /*
+ * Call soft limit reclaim before calling shrink_zone.
+ * For now we ignore the return value
+ */
+ mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
+ nid, zid);
/*
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
@@ -2014,6 +2098,15 @@ loop_again:
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
+
+ /*
+ * We are still under min water mark. it mean we have
+ * GFP_ATOMIC allocation failure risk. Hurry up!
+ */
+ if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
+ end_zone, 0))
+ has_under_min_watermark_zone = 1;
+
}
if (all_zones_ok)
break; /* kswapd: all done */
@@ -2021,8 +2114,12 @@ loop_again:
* OK, kswapd is getting into trouble. Take a nap, then take
* another pass across the zones.
*/
- if (total_scanned && priority < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ if (total_scanned && (priority < DEF_PRIORITY - 2)) {
+ if (has_under_min_watermark_zone)
+ count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
+ else
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
/*
* We do this so kswapd doesn't build up large priorities for
@@ -2120,6 +2217,7 @@ static int kswapd(void *p)
order = 0;
for ( ; ; ) {
unsigned long new_order;
+ int ret;
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
@@ -2131,19 +2229,45 @@ static int kswapd(void *p)
*/
order = new_order;
} else {
- if (!freezing(current))
- schedule();
+ if (!freezing(current) && !kthread_should_stop()) {
+ long remaining = 0;
+
+ /* Try to sleep for a short interval */
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ remaining = schedule_timeout(HZ/10);
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ }
+
+ /*
+ * After a short sleep, check if it was a
+ * premature sleep. If not, then go fully
+ * to sleep until explicitly woken up
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining))
+ schedule();
+ else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ }
order = pgdat->kswapd_max_order;
}
finish_wait(&pgdat->kswapd_wait, &wait);
- if (!try_to_freeze()) {
- /* We can speed up thawing tasks if we don't call
- * balance_pgdat after returning from the refrigerator
- */
+ ret = try_to_freeze();
+ if (kthread_should_stop())
+ break;
+
+ /*
+ * We can speed up thawing tasks if we don't call balance_pgdat
+ * after returning from the refrigerator
+ */
+ if (!ret)
balance_pgdat(pgdat, order);
- }
}
return 0;
}
@@ -2207,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
/*
- * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
- * from LRU lists system-wide, for given pass and priority.
- *
- * For pass > 3 we also try to shrink the LRU lists that contain a few pages
- */
-static void shrink_all_zones(unsigned long nr_pages, int prio,
- int pass, struct scan_control *sc)
-{
- struct zone *zone;
- unsigned long nr_reclaimed = 0;
- struct zone_reclaim_stat *reclaim_stat;
-
- for_each_populated_zone(zone) {
- enum lru_list l;
-
- if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
- continue;
-
- for_each_evictable_lru(l) {
- enum zone_stat_item ls = NR_LRU_BASE + l;
- unsigned long lru_pages = zone_page_state(zone, ls);
-
- /* For pass = 0, we don't shrink the active list */
- if (pass == 0 && (l == LRU_ACTIVE_ANON ||
- l == LRU_ACTIVE_FILE))
- continue;
-
- reclaim_stat = get_reclaim_stat(zone, sc);
- reclaim_stat->nr_saved_scan[l] +=
- (lru_pages >> prio) + 1;
- if (reclaim_stat->nr_saved_scan[l]
- >= nr_pages || pass > 3) {
- unsigned long nr_to_scan;
-
- reclaim_stat->nr_saved_scan[l] = 0;
- nr_to_scan = min(nr_pages, lru_pages);
- nr_reclaimed += shrink_list(l, nr_to_scan, zone,
- sc, prio);
- if (nr_reclaimed >= nr_pages) {
- sc->nr_reclaimed += nr_reclaimed;
- return;
- }
- }
- }
- }
- sc->nr_reclaimed += nr_reclaimed;
-}
-
-/*
- * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
*
* Rather than trying to age LRUs the aim is to preserve the overall
* LRU order by reclaiming preferentially
* inactive > active > active referenced > active mapped
*/
-unsigned long shrink_all_memory(unsigned long nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- unsigned long lru_pages, nr_slab;
- int pass;
struct reclaim_state reclaim_state;
struct scan_control sc = {
- .gfp_mask = GFP_KERNEL,
- .may_unmap = 0,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE,
+ .may_swap = 1,
+ .may_unmap = 1,
.may_writepage = 1,
+ .nr_to_reclaim = nr_to_reclaim,
+ .hibernation_mode = 1,
+ .swappiness = vm_swappiness,
+ .order = 0,
.isolate_pages = isolate_pages_global,
- .nr_reclaimed = 0,
};
+ struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ struct task_struct *p = current;
+ unsigned long nr_reclaimed;
- current->reclaim_state = &reclaim_state;
-
- lru_pages = global_reclaimable_pages();
- nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
- /* If slab caches are huge, it's better to hit them first */
- while (nr_slab >= lru_pages) {
- reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
- if (!reclaim_state.reclaimed_slab)
- break;
-
- sc.nr_reclaimed += reclaim_state.reclaimed_slab;
- if (sc.nr_reclaimed >= nr_pages)
- goto out;
-
- nr_slab -= reclaim_state.reclaimed_slab;
- }
-
- /*
- * We try to shrink LRUs in 5 passes:
- * 0 = Reclaim from inactive_list only
- * 1 = Reclaim from active list but don't reclaim mapped
- * 2 = 2nd pass of type 1
- * 3 = Reclaim mapped (normal reclaim)
- * 4 = 2nd pass of type 3
- */
- for (pass = 0; pass < 5; pass++) {
- int prio;
-
- /* Force reclaiming mapped pages in the passes #3 and #4 */
- if (pass > 2)
- sc.may_unmap = 1;
-
- for (prio = DEF_PRIORITY; prio >= 0; prio--) {
- unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
-
- sc.nr_scanned = 0;
- sc.swap_cluster_max = nr_to_scan;
- shrink_all_zones(nr_to_scan, prio, pass, &sc);
- if (sc.nr_reclaimed >= nr_pages)
- goto out;
-
- reclaim_state.reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, sc.gfp_mask,
- global_reclaimable_pages());
- sc.nr_reclaimed += reclaim_state.reclaimed_slab;
- if (sc.nr_reclaimed >= nr_pages)
- goto out;
-
- if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ / 10);
- }
- }
-
- /*
- * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
- * something in slab caches
- */
- if (!sc.nr_reclaimed) {
- do {
- reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask,
- global_reclaimable_pages());
- sc.nr_reclaimed += reclaim_state.reclaimed_slab;
- } while (sc.nr_reclaimed < nr_pages &&
- reclaim_state.reclaimed_slab > 0);
- }
+ p->flags |= PF_MEMALLOC;
+ lockdep_set_current_reclaim_state(sc.gfp_mask);
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
-out:
- current->reclaim_state = NULL;
+ p->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+ p->flags &= ~PF_MEMALLOC;
- return sc.nr_reclaimed;
+ return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */
@@ -2398,6 +2417,17 @@ int kswapd_run(int nid)
return ret;
}
+/*
+ * Called by memory hotplug when all memory in a node is offlined.
+ */
+void kswapd_stop(int nid)
+{
+ struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+
+ if (kswapd)
+ kthread_stop(kswapd);
+}
+
static int __init kswapd_init(void)
{
int nid;
@@ -2500,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.may_swap = 1,
- .swap_cluster_max = max_t(unsigned long, nr_pages,
- SWAP_CLUSTER_MAX),
+ .nr_to_reclaim = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
.order = order,
@@ -2801,10 +2831,10 @@ static void scan_all_zones_unevictable_pages(void)
unsigned long scan_unevictable_pages;
int scan_unevictable_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (write && *(unsigned long *)table->data)
scan_all_zones_unevictable_pages();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c81321f9feec..6051fbab67ba 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -683,6 +683,9 @@ static const char * const vmstat_text[] = {
"slabs_scanned",
"kswapd_steal",
"kswapd_inodesteal",
+ "kswapd_low_wmark_hit_quickly",
+ "kswapd_high_wmark_hit_quickly",
+ "kswapd_skip_congestion_wait",
"pageoutrun",
"allocstall",
@@ -883,11 +886,10 @@ static void vmstat_update(struct work_struct *w)
static void __cpuinit start_cpu_timer(int cpu)
{
- struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
+ struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
- schedule_delayed_work_on(cpu, vmstat_work,
- __round_jiffies_relative(HZ, cpu));
+ INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+ schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
/*