From 2c3cf556b2f7ab5823195766eddec6f1a43134f5 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 9 Oct 2012 15:31:12 +0200
Subject: x86/mm: Introduce pte_accessible()

We need pte_present to return true for _PAGE_PROTNONE pages, to indicate that
the pte is associated with a page.

However, for TLB flushing purposes, we would like to know whether the pte
points to an actually accessible page.  This allows us to skip remote TLB
flushes for pages that are not actually accessible.

Fill in this method for x86 and provide a safe (but slower) method
on other architectures.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Fixed-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-66p11te4uj23gevgh4j987ip@git.kernel.org
[ Added Linus's review fixes. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/pgtable.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b36ce40bd1c6..48fc1dc1c74b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
 #endif
 
+#ifndef pte_accessible
+# define pte_accessible(pte)		((void)(pte),1)
+#endif
+
 #ifndef flush_tlb_fix_spurious_fault
 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
 #endif
-- 
cgit v1.2.3


From 7da4d641c58d201c3cc1835c05ca1a7fa26f0856 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 19 Nov 2012 03:14:23 +0100
Subject: mm: Count the number of pages affected in change_protection()

This will be used for three kinds of purposes:

 - to optimize mprotect()

 - to speed up working set scanning for working set areas that
   have not been touched

 - to more accurately scan per real working set

No change in functionality from this patch.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/hugetlb.h | 8 ++++++--
 include/linux/mm.h      | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 225164842ab6..06e691baab86 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
 				pud_t *pud, int write);
 int pmd_huge(pmd_t pmd);
 int pud_huge(pud_t pmd);
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
 
 #else /* !CONFIG_HUGETLB_PAGE */
@@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src)
 {
 }
 
-#define hugetlb_change_protection(vma, address, end, newprot)
+static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	return 0;
+}
 
 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e6fe91..1856c62d82cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1078,6 +1078,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
 extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long old_len, unsigned long new_len,
 			       unsigned long flags, unsigned long new_addr);
+extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end, pgprot_t newprot,
+			      int dirty_accountable);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
-- 
cgit v1.2.3


From 5647bc293ab15f66a7b1cda850c5e9d162a6c7c2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 19 Oct 2012 10:46:20 +0100
Subject: mm: compaction: Move migration fail/success stats to migrate.c

The compact_pages_moved and compact_pagemigrate_failed events are
convenient for determining if compaction is active and to what
degree migration is succeeding but it's at the wrong level. Other
users of migration may also want to know if migration is working
properly and this will be particularly true for any automated
NUMA migration. This patch moves the counters down to migration
with the new events called pgmigrate_success and pgmigrate_fail.
The compact_blocks_moved counter is removed because while it was
useful for debugging initially, it's worthless now as no meaningful
conclusions can be drawn from its value.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/vm_event_item.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 3d3114594370..8aa7cb94b5fb 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,8 +38,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
 		KSWAPD_SKIP_CONGESTION_WAIT,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+#ifdef CONFIG_MIGRATION
+		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
+#endif
 #ifdef CONFIG_COMPACTION
-		COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
 		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
-- 
cgit v1.2.3


From 7b2a2d4a18fffac3c4872021529b0657896db788 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 19 Oct 2012 14:07:31 +0100
Subject: mm: migrate: Add a tracepoint for migrate_pages

The pgmigrate_success and pgmigrate_fail vmstat counters tells the user
about migration activity but not the type or the reason. This patch adds
a tracepoint to identify the type of page migration and why the page is
being migrated.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/migrate.h        | 13 +++++++++--
 include/trace/events/migrate.h | 51 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/migrate.h

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ce7e6671968b..9d1c159e2427 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,6 +7,15 @@
 
 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
+enum migrate_reason {
+	MR_COMPACTION,
+	MR_MEMORY_FAILURE,
+	MR_MEMORY_HOTPLUG,
+	MR_SYSCALL,		/* also applies to cpusets */
+	MR_MEMPOLICY_MBIND,
+	MR_CMA
+};
+
 #ifdef CONFIG_MIGRATION
 
 extern void putback_lru_pages(struct list_head *l);
@@ -14,7 +23,7 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, bool offlining,
-			enum migrate_mode mode);
+			enum migrate_mode mode, int reason);
 extern int migrate_huge_page(struct page *, new_page_t x,
 			unsigned long private, bool offlining,
 			enum migrate_mode mode);
@@ -35,7 +44,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, bool offlining,
-		enum migrate_mode mode) { return -ENOSYS; }
+		enum migrate_mode mode, int reason) { return -ENOSYS; }
 static inline int migrate_huge_page(struct page *page, new_page_t x,
 		unsigned long private, bool offlining,
 		enum migrate_mode mode) { return -ENOSYS; }
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
new file mode 100644
index 000000000000..ec2a6ccfd7e5
--- /dev/null
+++ b/include/trace/events/migrate.h
@@ -0,0 +1,51 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM migrate
+
+#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MIGRATE_H
+
+#define MIGRATE_MODE						\
+	{MIGRATE_ASYNC,		"MIGRATE_ASYNC"},		\
+	{MIGRATE_SYNC_LIGHT,	"MIGRATE_SYNC_LIGHT"},		\
+	{MIGRATE_SYNC,		"MIGRATE_SYNC"}		
+
+#define MIGRATE_REASON						\
+	{MR_COMPACTION,		"compaction"},			\
+	{MR_MEMORY_FAILURE,	"memory_failure"},		\
+	{MR_MEMORY_HOTPLUG,	"memory_hotplug"},		\
+	{MR_SYSCALL,		"syscall_or_cpuset"},		\
+	{MR_MEMPOLICY_MBIND,	"mempolicy_mbind"},		\
+	{MR_CMA,		"cma"}
+
+TRACE_EVENT(mm_migrate_pages,
+
+	TP_PROTO(unsigned long succeeded, unsigned long failed,
+		 enum migrate_mode mode, int reason),
+
+	TP_ARGS(succeeded, failed, mode, reason),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		succeeded)
+		__field(	unsigned long,		failed)
+		__field(	enum migrate_mode,	mode)
+		__field(	int,			reason)
+	),
+
+	TP_fast_assign(
+		__entry->succeeded	= succeeded;
+		__entry->failed		= failed;
+		__entry->mode		= mode;
+		__entry->reason		= reason;
+	),
+
+	TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s",
+		__entry->succeeded,
+		__entry->failed,
+		__print_symbolic(__entry->mode, MIGRATE_MODE),
+		__print_symbolic(__entry->reason, MIGRATE_REASON))
+);
+
+#endif /* _TRACE_MIGRATE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 397487db696cae0b026a474a5cd66f4e372995e6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 19 Oct 2012 12:00:10 +0100
Subject: mm: compaction: Add scanned and isolated counters for compaction

Compaction already has tracepoints to count scanned and isolated pages
but it requires that ftrace be enabled and if that information has to be
written to disk then it can be disruptive. This patch adds vmstat counters
for compaction called compact_migrate_scanned, compact_free_scanned and
compact_isolated.

With these counters, it is possible to define a basic cost model for
compaction. This approximates of how much work compaction is doing and can
be compared that with an oprofile showing TLB misses and see if the cost of
compaction is being offset by THP for example. Minimally a compaction patch
can be evaluated in terms of whether it increases or decreases cost. The
basic cost model looks like this

Fundamental unit u:	a word	sizeof(void *)

Ca  = cost of struct page access = sizeof(struct page) / u

Cmc = Cost migrate page copy = (Ca + PAGE_SIZE/u) * 2
Cmf = Cost migrate failure   = Ca * 2
Ci  = Cost page isolation    = (Ca + Wi)
	where Wi is a constant that should reflect the approximate
	cost of the locking operation.

Csm = Cost migrate scanning = Ca
Csf = Cost free    scanning = Ca

Overall cost =	(Csm * compact_migrate_scanned) +
	      	(Csf * compact_free_scanned)    +
	      	(Ci  * compact_isolated)	+
		(Cmc * pgmigrate_success)	+
		(Cmf * pgmigrate_failed)

Where the values are read from /proc/vmstat.

This is very basic and ignores certain costs such as the allocation cost
to do a migrate page copy but any improvement to the model would still
use the same vmstat counters.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/vm_event_item.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 8aa7cb94b5fb..a1f750b8e72a 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -42,6 +42,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
 #endif
 #ifdef CONFIG_COMPACTION
+		COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
+		COMPACTISOLATED,
 		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
-- 
cgit v1.2.3


From be3a728427a605990a7a0b6dbf9e29b68e266146 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 4 Oct 2012 01:50:47 +0200
Subject: mm: numa: pte_numa() and pmd_numa()

Implement pte_numa and pmd_numa.

We must atomically set the numa bit and clear the present bit to
define a pte_numa or pmd_numa.

Once a pte or pmd has been set as pte_numa or pmd_numa, the next time
a thread touches a virtual address in the corresponding virtual range,
a NUMA hinting page fault will trigger. The NUMA hinting page fault
will clear the NUMA bit and set the present bit again to resolve the
page fault.

The expectation is that a NUMA hinting page fault is used as part
of a placement policy that decides if a page should remain on the
current node or migrated to a different node.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/asm-generic/pgtable.h | 106 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 48fc1dc1c74b..f27c83668d10 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,6 +558,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
+ * same bit too). It's set only when _PAGE_PRESET is not set and it's
+ * never set if _PAGE_PRESENT is set.
+ *
+ * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
+ * fault triggers on those regions if pte/pmd_numa returns true
+ * (because _PAGE_PRESENT is not set).
+ */
+#ifndef pte_numa
+static inline int pte_numa(pte_t pte)
+{
+	return (pte_flags(pte) &
+		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+#ifndef pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+	return (pmd_flags(pmd) &
+		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+/*
+ * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
+ * because they're called by the NUMA hinting minor page fault. If we
+ * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
+ * would be forced to set it later while filling the TLB after we
+ * return to userland. That would trigger a second write to memory
+ * that we optimize away by setting _PAGE_ACCESSED here.
+ */
+#ifndef pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+	pte = pte_clear_flags(pte, _PAGE_NUMA);
+	return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+	pmd = pmd_clear_flags(pmd, _PAGE_NUMA);
+	return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+	pte = pte_set_flags(pte, _PAGE_NUMA);
+	return pte_clear_flags(pte, _PAGE_PRESENT);
+}
+#endif
+
+#ifndef pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+	pmd = pmd_set_flags(pmd, _PAGE_NUMA);
+	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+#else
+extern int pte_numa(pte_t pte);
+extern int pmd_numa(pmd_t pmd);
+extern pte_t pte_mknonnuma(pte_t pte);
+extern pmd_t pmd_mknonnuma(pmd_t pmd);
+extern pte_t pte_mknuma(pte_t pte);
+extern pmd_t pmd_mknuma(pmd_t pmd);
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#else
+static inline int pmd_numa(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pte_numa(pte_t pte)
+{
+	return 0;
+}
+
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+	return pte;
+}
+
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+	return pmd;
+}
+
+static inline pte_t pte_mknuma(pte_t pte)
+{
+	return pte;
+}
+
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+	return pmd;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 #endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
-- 
cgit v1.2.3


From 0b9d705297b273657923518dbea2377cd03532ed Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 5 Oct 2012 21:36:27 +0200
Subject: mm: numa: Support NUMA hinting page faults from gup/gup_fast

Introduce FOLL_NUMA to tell follow_page to check
pte/pmd_numa. get_user_pages must use FOLL_NUMA, and it's safe to do
so because it always invokes handle_mm_fault and retries the
follow_page later.

KVM secondary MMU page faults will trigger the NUMA hinting page
faults through gup_fast -> get_user_pages -> follow_page ->
handle_mm_fault.

Other follow_page callers like KSM should not use FOLL_NUMA, or they
would fail to get the pages if they use follow_page instead of
get_user_pages.

[ This patch was picked up from the AutoNUMA tree. ]

Originally-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
[ ported to this tree. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1856c62d82cd..fa1615211159 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1572,6 +1572,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
-- 
cgit v1.2.3


From d10e63f29488b0f312a443f9507ea9b6fd3c9090 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 25 Oct 2012 14:16:31 +0200
Subject: mm: numa: Create basic numa page hinting infrastructure

Note: This patch started as "mm/mpol: Create special PROT_NONE
	infrastructure" and preserves the basic idea but steals *very*
	heavily from "autonuma: numa hinting page faults entry points" for
	the actual fault handlers without the migration parts.	The end
	result is barely recognisable as either patch so all Signed-off
	and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with
	this version, I will re-add the signed-offs-by to reflect the history.

In order to facilitate a lazy -- fault driven -- migration of pages, create
a special transient PAGE_NUMA variant, we can then use the 'spurious'
protection faults to drive our migrations from.

The meaning of PAGE_NUMA depends on the architecture but on x86 it is
effectively PROT_NONE. Actual PROT_NONE mappings will not generate these
NUMA faults for the reason that the page fault code checks the permission on
the VMA (and will throw a segmentation fault on actual PROT_NONE mappings),
before it ever calls handle_mm_fault.

[dhillf@gmail.com: Fix typo]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/huge_mm.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b31cb7da0346..a1d26a98c655 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page)
 	}
 	return page;
 }
+
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
+				  pmd_t pmd, pmd_t *pmdp);
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
 {
 	return 0;
 }
+
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
+					pmd_t pmd, pmd_t *pmdp)
+{
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
-- 
cgit v1.2.3


From 479e2802d09f1e18a97262c4c6f8f17ae5884bd8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Oct 2012 14:16:28 +0200
Subject: mm: mempolicy: Make MPOL_LOCAL a real policy

Make MPOL_LOCAL a real and exposed policy such that applications that
relied on the previous default behaviour can explicitly request it.

Requested-by: Christoph Lameter <cl@linux.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/uapi/linux/mempolicy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 23e62e0537e2..3e835c9d847b 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -20,6 +20,7 @@ enum {
 	MPOL_PREFERRED,
 	MPOL_BIND,
 	MPOL_INTERLEAVE,
+	MPOL_LOCAL,
 	MPOL_MAX,	/* always last member of enum */
 };
 
-- 
cgit v1.2.3


From d3a710337b0590f43fd236d5e6518439afc7410a Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Thu, 25 Oct 2012 14:16:29 +0200
Subject: mm: mempolicy: Add MPOL_NOOP

This patch augments the MPOL_MF_LAZY feature by adding a "NOOP" policy
to mbind().  When the NOOP policy is used with the 'MOVE and 'LAZY
flags, mbind() will map the pages PROT_NONE so that they will be
migrated on the next touch.

This allows an application to prepare for a new phase of operation
where different regions of shared storage will be assigned to
worker threads, w/o changing policy.  Note that we could just use
"default" policy in this case.  However, this also allows an
application to request that pages be migrated, only if necessary,
to follow any arbitrary policy that might currently apply to a
range of pages, without knowing the policy, or without specifying
multiple mbind()s for ranges with different policies.

[ Bug in early version of mpol_parse_str() reported by Fengguang Wu. ]

Bug-Reported-by: Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/uapi/linux/mempolicy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 3e835c9d847b..d23dca8367cc 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -21,6 +21,7 @@ enum {
 	MPOL_BIND,
 	MPOL_INTERLEAVE,
 	MPOL_LOCAL,
+	MPOL_NOOP,		/* retain existing policy for range */
 	MPOL_MAX,	/* always last member of enum */
 };
 
-- 
cgit v1.2.3


From 771fb4d806a92bf6c988fcfbd286ae40a9374332 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Thu, 25 Oct 2012 14:16:30 +0200
Subject: mm: mempolicy: Check for misplaced page

This patch provides a new function to test whether a page resides
on a node that is appropriate for the mempolicy for the vma and
address where the page is supposed to be mapped.  This involves
looking up the node where the page belongs.  So, the function
returns that node so that it may be used to allocated the page
without consulting the policy again.

A subsequent patch will call this function from the fault path.
Because of this, I don't want to go ahead and allocate the page, e.g.,
via alloc_page_vma() only to have to free it if it has the correct
policy.  So, I just mimic the alloc_page_vma() node computation
logic--sort of.

Note:  we could use this function to implement a MPOL_MF_STRICT
behavior when migrating pages to match mbind() mempolicy--e.g.,
to ensure that pages in an interleaved range are reinterleaved
rather than left where they are when they reside on any page in
the interleave nodemask.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
[ Added MPOL_F_LAZY to trigger migrate-on-fault;
  simplified code now that we don't have to bother
  with special crap for interleaved ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mempolicy.h      | 8 ++++++++
 include/uapi/linux/mempolicy.h | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index e5ccb9ddd90e..c511e2523560 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -198,6 +198,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 	return 1;
 }
 
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+
 #else
 
 struct mempolicy {};
@@ -323,5 +325,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
 	return 0;
 }
 
+static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+				 unsigned long address)
+{
+	return -1; /* no node preference */
+}
+
 #endif /* CONFIG_NUMA */
 #endif
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index d23dca8367cc..472de8a5d37e 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -61,6 +61,7 @@ enum mpol_rebind_step {
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
 #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
 #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
+#define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
 
 
 #endif /* _UAPI_LINUX_MEMPOLICY_H */
-- 
cgit v1.2.3


From 7039e1dbec6eeaa8ecab43a82d6589eeced995c3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Oct 2012 14:16:34 +0200
Subject: mm: migrate: Introduce migrate_misplaced_page()

Note: This was originally based on Peter's patch "mm/migrate: Introduce
	migrate_misplaced_page()" but borrows extremely heavily from Andrea's
	"autonuma: memory follows CPU algorithm and task/mm_autonuma stats
	collection". The end result is barely recognisable so signed-offs
	had to be dropped. If original authors are ok with it, I'll
	re-add the signed-off-bys.

Add migrate_misplaced_page() which deals with migrating pages from
faults.

Based-on-work-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Based-on-work-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Based-on-work-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/migrate.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9d1c159e2427..f0d0313eea6f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -13,6 +13,7 @@ enum migrate_reason {
 	MR_MEMORY_HOTPLUG,
 	MR_SYSCALL,		/* also applies to cpusets */
 	MR_MEMPOLICY_MBIND,
+	MR_NUMA_MISPLACED,
 	MR_CMA
 };
 
@@ -73,4 +74,14 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #define fail_migrate_page NULL
 
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_NUMA_BALANCING
+extern int migrate_misplaced_page(struct page *page, int node);
+#else
+static inline int migrate_misplaced_page(struct page *page, int node)
+{
+	return -EAGAIN; /* can't migrate now */
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 #endif /* _LINUX_MIGRATE_H */
-- 
cgit v1.2.3


From 4daae3b4b9e49b7e0935499a352f1c59d90287d2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 2 Nov 2012 11:33:45 +0000
Subject: mm: mempolicy: Use _PAGE_NUMA to migrate pages

Note: Based on "mm/mpol: Use special PROT_NONE to migrate pages" but
	sufficiently different that the signed-off-bys were dropped

Combine our previous _PAGE_NUMA, mpol_misplaced and migrate_misplaced_page()
pieces into an effective migrate on fault scheme.

Note that (on x86) we rely on PROT_NONE pages being !present and avoid
the TLB flush from try_to_unmap(TTU_MIGRATION). This greatly improves the
page-migration performance.

Based-on-work-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/huge_mm.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a1d26a98c655..dabb5108d6c0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -160,8 +160,8 @@ static inline struct page *compound_trans_head(struct page *page)
 	return page;
 }
 
-extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
-				  pmd_t pmd, pmd_t *pmdp);
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+				unsigned long addr, pmd_t pmd, pmd_t *pmdp);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
@@ -200,9 +200,10 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
 	return 0;
 }
 
-static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
-					pmd_t pmd, pmd_t *pmdp)
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+					unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+	return 0;
 }
 
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
cgit v1.2.3


From b24f53a0bea38b266d219ee651b22dba727c44ae Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Thu, 25 Oct 2012 14:16:32 +0200
Subject: mm: mempolicy: Add MPOL_MF_LAZY

NOTE: Once again there is a lot of patch stealing and the end result
	is sufficiently different that I had to drop the signed-offs.
	Will re-add if the original authors are ok with that.

This patch adds another mbind() flag to request "lazy migration".  The
flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected
pages are marked PROT_NONE. The pages will be migrated in the fault
path on "first touch", if the policy dictates at that time.

"Lazy Migration" will allow testing of migrate-on-fault via mbind().
Also allows applications to specify that only subsequently touched
pages be migrated to obey new policy, instead of all pages in range.
This can be useful for multi-threaded applications working on a
large shared data area that is initialized by an initial thread
resulting in all pages on one [or a few, if overflowed] nodes.
After PROT_NONE, the pages in regions assigned to the worker threads
will be automatically migrated local to the threads on 1st touch.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/mm.h             |  5 +++++
 include/uapi/linux/mempolicy.h | 13 ++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa1615211159..471185e29bab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
 }
 #endif
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+void change_prot_numa(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+#endif
+
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 472de8a5d37e..6a1baae3775d 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -49,9 +49,16 @@ enum mpol_rebind_step {
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
-#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
-#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
+#define MPOL_MF_MOVE	 (1<<1)	/* Move pages owned by this process to conform
+				   to policy */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to policy */
+#define MPOL_MF_LAZY	 (1<<3)	/* Modifies '_MOVE:  lazy migrate on fault */
+#define MPOL_MF_INTERNAL (1<<4)	/* Internal flags start here */
+
+#define MPOL_MF_VALID	(MPOL_MF_STRICT   | 	\
+			 MPOL_MF_MOVE     | 	\
+			 MPOL_MF_MOVE_ALL |	\
+			 MPOL_MF_LAZY)
 
 /*
  * Internal flags that share the struct mempolicy flags word with
-- 
cgit v1.2.3


From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 25 Oct 2012 14:16:32 +0200
Subject: mm: mempolicy: Implement change_prot_numa() in terms of
 change_protection()

This patch converts change_prot_numa() to use change_protection(). As
pte_numa and friends check the PTE bits directly it is necessary for
change_protection() to use pmd_mknuma(). Hence the required
modifications to change_protection() are a little clumsy but the
end result is that most of the numa page table helpers are just one or
two instructions.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/huge_mm.h | 3 ++-
 include/linux/mm.h      | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index dabb5108d6c0..027ad04ef3a8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -27,7 +27,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-			unsigned long addr, pgprot_t newprot);
+			unsigned long addr, pgprot_t newprot,
+			int prot_numa);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 471185e29bab..d04c2f0aab36 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1080,7 +1080,7 @@ extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long flags, unsigned long new_addr);
 extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 			      unsigned long end, pgprot_t newprot,
-			      int dirty_accountable);
+			      int dirty_accountable, int prot_numa);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
@@ -1552,7 +1552,7 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
 #endif
 
 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
-void change_prot_numa(struct vm_area_struct *vma,
+unsigned long change_prot_numa(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
 #endif
 
-- 
cgit v1.2.3


From a720094ded8cbb303111035be91858011d2eac71 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 16 Nov 2012 09:37:58 +0000
Subject: mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now

The use of MPOL_NOOP and MPOL_MF_LAZY to allow an application to
explicitly request lazy migration is a good idea but the actual
API has not been well reviewed and once released we have to support it.
For now this patch prevents an application using the services. This
will need to be revisited.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/uapi/linux/mempolicy.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 6a1baae3775d..16fb4e6efbc4 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -21,7 +21,6 @@ enum {
 	MPOL_BIND,
 	MPOL_INTERLEAVE,
 	MPOL_LOCAL,
-	MPOL_NOOP,		/* retain existing policy for range */
 	MPOL_MAX,	/* always last member of enum */
 };
 
@@ -57,8 +56,7 @@ enum mpol_rebind_step {
 
 #define MPOL_MF_VALID	(MPOL_MF_STRICT   | 	\
 			 MPOL_MF_MOVE     | 	\
-			 MPOL_MF_MOVE_ALL |	\
-			 MPOL_MF_LAZY)
+			 MPOL_MF_MOVE_ALL)
 
 /*
  * Internal flags that share the struct mempolicy flags word with
-- 
cgit v1.2.3


From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Oct 2012 14:16:43 +0200
Subject: mm: numa: Add fault driven placement and migration

NOTE: This patch is based on "sched, numa, mm: Add fault driven
	placement and migration policy" but as it throws away all the policy
	to just leave a basic foundation I had to drop the signed-offs-by.

This patch creates a bare-bones method for setting PTEs pte_numa in the
context of the scheduler that when faulted later will be faulted onto the
node the CPU is running on.  In itself this does nothing useful but any
placement policy will fundamentally depend on receiving hints on placement
from fault context and doing something intelligent about it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
 include/linux/mm_types.h | 11 +++++++++++
 include/linux/sched.h    | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d94..ed8638c29b3e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -397,6 +397,17 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * numa_next_scan is the next time when the PTEs will me marked
+	 * pte_numa to gather statistics and migrate pages to new nodes
+	 * if necessary
+	 */
+	unsigned long numa_next_scan;
+
+	/* numa_scan_seq prevents two threads setting pte_numa */
+	int numa_scan_seq;
 #endif
 	struct uprobes_state uprobes_state;
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e..844af5b12cb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1479,6 +1479,14 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_scan_seq;
+	int numa_migrate_seq;
+	unsigned int numa_scan_period;
+	u64 node_stamp;			/* migration stamp  */
+	struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
 	struct rcu_head rcu;
 
 	/*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-- 
cgit v1.2.3


From 6e5fb223e89dbe5cb5c563f8d4a4a0a7d62455a8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Oct 2012 14:16:45 +0200
Subject: mm: sched: numa: Implement constant, per task Working Set Sampling
 (WSS) rate

Previously, to probe the working set of a task, we'd use
a very simple and crude method: mark all of its address
space PROT_NONE.

That method has various (obvious) disadvantages:

 - it samples the working set at dissimilar rates,
   giving some tasks a sampling quality advantage
   over others.

 - creates performance problems for tasks with very
   large working sets

 - over-samples processes with large address spaces but
   which only very rarely execute

Improve that method by keeping a rotating offset into the
address space that marks the current position of the scan,
and advance it by a constant rate (in a CPU cycles execution
proportional manner). If the offset reaches the last mapped
address of the mm then it then it starts over at the first
address.

The per-task nature of the working set sampling functionality in this tree
allows such constant rate, per task, execution-weight proportional sampling
of the working set, with an adaptive sampling interval/frequency that
goes from once per 100ms up to just once per 8 seconds.  The current
sampling volume is 256 MB per interval.

As tasks mature and converge their working set, so does the
sampling rate slow down to just a trickle, 256 MB per 8
seconds of CPU time executed.

This, beyond being adaptive, also rate-limits rarely
executing systems and does not over-sample on overloaded
systems.

[ In AutoNUMA speak, this patch deals with the effective sampling
  rate of the 'hinting page fault'. AutoNUMA's scanning is
  currently rate-limited, but it is also fundamentally
  single-threaded, executing in the knuma_scand kernel thread,
  so the limit in AutoNUMA is global and does not scale up with
  the number of CPUs, nor does it scan tasks in an execution
  proportional manner.

  So the idea of rate-limiting the scanning was first implemented
  in the AutoNUMA tree via a global rate limit. This patch goes
  beyond that by implementing an execution rate proportional
  working set sampling rate that is not implemented via a single
  global scanning daemon. ]

[ Dan Carpenter pointed out a possible NULL pointer dereference in the
  first version of this patch. ]

Based-on-idea-by: Andrea Arcangeli <aarcange@redhat.com>
Bug-Found-By: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
[ Wrote changelog and fixed bug. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/mm_types.h | 3 +++
 include/linux/sched.h    | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ed8638c29b3e..d1e246c5e50c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -406,6 +406,9 @@ struct mm_struct {
 	 */
 	unsigned long numa_next_scan;
 
+	/* Restart point for scanning and setting pte_numa */
+	unsigned long numa_scan_offset;
+
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 844af5b12cb2..37841958d234 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2008,6 +2008,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_settle_count;
 
 #ifdef CONFIG_SCHED_DEBUG
-- 
cgit v1.2.3


From 4b96a29ba891dd59734cb7be80a900fe93aa2d9f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Oct 2012 14:16:47 +0200
Subject: mm: sched: numa: Implement slow start for working set sampling

Add a 1 second delay before starting to scan the working set of
a task and starting to balance it amongst nodes.

[ note that before the constant per task WSS sampling rate patch
  the initial scan would happen much later still, in effect that
  patch caused this regression. ]

The theory is that short-run tasks benefit very little from NUMA
placement: they come and go, and they better stick to the node
they were started on. As tasks mature and rebalance to other CPUs
and nodes, so does their NUMA placement have to change and so
does it start to matter more and more.

In practice this change fixes an observable kbuild regression:

   # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ]

   !NUMA:
   45.291088843 seconds time elapsed                                          ( +-  0.40% )
   45.154231752 seconds time elapsed                                          ( +-  0.36% )

   +NUMA, no slow start:
   46.172308123 seconds time elapsed                                          ( +-  0.30% )
   46.343168745 seconds time elapsed                                          ( +-  0.25% )

   +NUMA, 1 sec slow start:
   45.224189155 seconds time elapsed                                          ( +-  0.25% )
   45.160866532 seconds time elapsed                                          ( +-  0.17% )

and it also fixes an observable perf bench (hackbench) regression:

   # perf stat --null --repeat 10 perf bench sched messaging

   -NUMA:

   -NUMA:                  0.246225691 seconds time elapsed                   ( +-  1.31% )
   +NUMA no slow start:    0.252620063 seconds time elapsed                   ( +-  1.13% )

   +NUMA 1sec delay:       0.248076230 seconds time elapsed                   ( +-  1.35% )

The implementation is simple and straightforward, most of the patch
deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms tunable
knob.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
[ Wrote the changelog, ran measurements, tuned the default. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 37841958d234..7d95a232b5b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2006,6 +2006,7 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
 extern unsigned int sysctl_numa_balancing_scan_size;
-- 
cgit v1.2.3


From 03c5a6e16322c997bf8f264851bfa3f532ad515f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 2 Nov 2012 14:52:48 +0000
Subject: mm: numa: Add pte updates, hinting and migration stats

It is tricky to quantify the basic cost of automatic NUMA placement in a
meaningful manner. This patch adds some vmstats that can be used as part
of a basic costing model.

u    = basic unit = sizeof(void *)
Ca   = cost of struct page access = sizeof(struct page) / u
Cpte = Cost PTE access = Ca
Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock)
	where Cpte is incurred twice for a read and a write and Wlock
	is a constant representing the cost of taking or releasing a
	lock
Cnumahint = Cost of a minor page fault = some high constant e.g. 1000
Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u
Ci = Cost of page isolation = Ca + Wi
	where Wi is a constant that should reflect the approximate cost
	of the locking operation
Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma)
	where Wnuma is the approximate NUMA factor. 1 is local. 1.2
	would imply that remote accesses are 20% more expensive

Balancing cost = Cpte * numa_pte_updates +
		Cnumahint * numa_hint_faults +
		Ci * numa_pages_migrated +
		Cpagecopy * numa_pages_migrated

Note that numa_pages_migrated is used as a measure of how many pages
were isolated even though it would miss pages that failed to migrate. A
vmstat counter could have been added for it but the isolation cost is
pretty marginal in comparison to the overall cost so it seemed overkill.

The ideal way to measure automatic placement benefit would be to count
the number of remote accesses versus local accesses and do something like

	benefit = (remote_accesses_before - remove_access_after) * Wnuma

but the information is not readily available. As a workload converges, the
expection would be that the number of remote numa hints would reduce to 0.

	convergence = numa_hint_faults_local / numa_hint_faults
		where this is measured for the last N number of
		numa hints recorded. When the workload is fully
		converged the value is 1.

This can measure if the placement policy is converging and how fast it is
doing it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
 include/linux/vm_event_item.h | 6 ++++++
 include/linux/vmstat.h        | 8 ++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index a1f750b8e72a..55600049e794 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,6 +38,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
 		KSWAPD_SKIP_CONGESTION_WAIT,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+#ifdef CONFIG_NUMA_BALANCING
+		NUMA_PTE_UPDATES,
+		NUMA_HINT_FAULTS,
+		NUMA_HINT_FAULTS_LOCAL,
+		NUMA_PAGE_MIGRATE,
+#endif
 #ifdef CONFIG_MIGRATION
 		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
 #endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 92a86b2cce33..a13291f7da88 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu)
 
 #endif /* CONFIG_VM_EVENT_COUNTERS */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define count_vm_numa_event(x)     count_vm_event(x)
+#define count_vm_numa_events(x, y) count_vm_events(x, y)
+#else
+#define count_vm_numa_event(x) do {} while (0)
+#define count_vm_numa_events(x, y) do {} while (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
 #define __count_zone_vm_events(item, zone, delta) \
 		__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
 		zone_idx(zone), delta)
-- 
cgit v1.2.3


From 5606e3877ad8baea42f3a71ebde0a03622bbb551 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Fri, 2 Nov 2012 18:19:13 +0000
Subject: mm: numa: Migrate on reference policy

This is the simplest possible policy that still does something of note.
When a pte_numa is faulted, it is moved immediately. Any replacement
policy must at least do better than this and in all likelihood this
policy regresses normal workloads.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
 include/uapi/linux/mempolicy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 16fb4e6efbc4..0d11c3dcd3a1 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -67,6 +67,7 @@ enum mpol_rebind_step {
 #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
 #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
+#define MPOL_F_MORON	(1 << 4) /* Migrate On pte_numa Reference On Node */
 
 
 #endif /* _UAPI_LINUX_MEMPOLICY_H */
-- 
cgit v1.2.3


From 8177a420ed7c16c171ed3c3aec5b0676db38c247 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 23 Mar 2012 20:56:34 +0100
Subject: mm: numa: Structures for Migrate On Fault per NUMA migration rate
 limiting

This defines the per-node data used by Migrate On Fault in order to
rate limit the migration. The rate limiting is applied independently
to each destination node.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mmzone.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a23923ba8263..28601fdfcdb2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -717,6 +717,19 @@ typedef struct pglist_data {
 	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * Lock serializing the per destination node AutoNUMA memory
+	 * migration rate limiting data.
+	 */
+	spinlock_t numabalancing_migrate_lock;
+
+	/* Rate limiting time interval */
+	unsigned long numabalancing_migrate_next_window;
+
+	/* Number of pages migrated during the rate limiting time interval */
+	unsigned long numabalancing_migrate_nr_pages;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
-- 
cgit v1.2.3


From e14808b49f55e0e1135da5e4a154a540dd9f3662 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 19 Nov 2012 10:59:15 +0000
Subject: mm: numa: Rate limit setting of pte_numa if node is saturated

If there are a large number of NUMA hinting faults and all of them
are resulting in migrations it may indicate that memory is just
bouncing uselessly around. NUMA balancing cost is likely exceeding
any benefit from locality. Rate limit the PTE updates if the node
is migration rate-limited. As noted in the comments, this distorts
the NUMA faulting statistics.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/migrate.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f0d0313eea6f..91556889adac 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -77,11 +77,17 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page, int node);
+extern bool migrate_ratelimited(int node);
 #else
 static inline int migrate_misplaced_page(struct page *page, int node)
 {
 	return -EAGAIN; /* can't migrate now */
 }
+static inline bool migrate_ratelimited(int node)
+{
+	return false;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* _LINUX_MIGRATE_H */
-- 
cgit v1.2.3


From 57e0a0309160b1b4ebde9f3c6a867cd96ac368bf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 12 Nov 2012 09:06:20 +0000
Subject: mm: numa: Introduce last_nid to the page frame

This patch introduces a last_nid field to the page struct. This is used
to build a two-stage filter in the next patch that is aimed at
mitigating a problem whereby pages migrate to the wrong node when
referenced by a process that was running off its home node.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mm.h       | 30 ++++++++++++++++++++++++++++++
 include/linux/mm_types.h |  4 ++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d04c2f0aab36..d87f9ec4a145 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page)
 }
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+	return xchg(&page->_last_nid, nid);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+	return page->_last_nid;
+}
+static inline void reset_page_last_nid(struct page *page)
+{
+	page->_last_nid = -1;
+}
+#else
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+	return page_to_nid(page);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+	return page_to_nid(page);
+}
+
+static inline void reset_page_last_nid(struct page *page)
+{
+}
+#endif
+
 static inline struct zone *page_zone(const struct page *page)
 {
 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d1e246c5e50c..c5fffa239861 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,6 +175,10 @@ struct page {
 	 */
 	void *shadow;
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	int _last_nid;
+#endif
 }
 /*
  * The struct page can be forced to be double word aligned so that atomic ops
-- 
cgit v1.2.3


From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 21 Nov 2012 01:18:23 +0000
Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not
 migrate

The PTE scanning rate and fault rates are two of the biggest sources of
system CPU overhead with automatic NUMA placement.  Ideally a proper policy
would detect if a workload was properly placed, schedule and adjust the
PTE scanning rate accordingly. We do not track the necessary information
to do that but we at least know if we migrated or not.

This patch scans slower if a page was not migrated as the result of a
NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is
now higher than the previous default. Once every minute it will reset
the scanner in case of phase changes.

This is hilariously crude and the numbers are arbitrary. Workloads will
converge quite slowly in comparison to what a proper policy should be able
to do. On the plus side, we will chew up less CPU for workloads that have
no need for automatic balancing.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mm_types.h | 3 +++
 include/linux/sched.h    | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c5fffa239861..e850a23dd6ec 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -410,6 +410,9 @@ struct mm_struct {
 	 */
 	unsigned long numa_next_scan;
 
+	/* numa_next_reset is when the PTE scanner period will be reset */
+	unsigned long numa_next_reset;
+
 	/* Restart point for scanning and setting pte_numa */
 	unsigned long numa_scan_offset;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d95a232b5b9..0f4ff2bd03f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1562,9 +1562,9 @@ struct task_struct {
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
 #ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages);
+extern void task_numa_fault(int node, int pages, bool migrated);
 #else
-static inline void task_numa_fault(int node, int pages)
+static inline void task_numa_fault(int node, int pages, bool migrated)
 {
 }
 #endif
@@ -2009,6 +2009,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
 extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_settle_count;
 
-- 
cgit v1.2.3


From 1a687c2e9a99335c9e77392f050fe607fa18a652 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 22 Nov 2012 11:16:36 +0000
Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing

This patch adds Kconfig options and kernel parameters to allow the
enabling and disabling of automatic NUMA balancing. The existance
of such a switch was and is very important when debugging problems
related to transparent hugepages and we should have the same for
automatic NUMA placement.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f4ff2bd03f6..b1e619f9ff1a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1563,10 +1563,14 @@ struct task_struct {
 
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int node, int pages, bool migrated);
+extern void set_numabalancing_state(bool enabled);
 #else
 static inline void task_numa_fault(int node, int pages, bool migrated)
 {
 }
+static inline void set_numabalancing_state(bool enabled)
+{
+}
 #endif
 
 /*
-- 
cgit v1.2.3


From 5bca23035391928c4c7301835accca3551b96cc2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 22 Nov 2012 14:40:03 +0000
Subject: mm: sched: numa: Delay PTE scanning until a task is scheduled on a
 new node

Due to the fact that migrations are driven by the CPU a task is running
on there is no point tracking NUMA faults until one task runs on a new
node. This patch tracks the first node used by an address space. Until
it changes, PTE scanning is disabled and no NUMA hinting faults are
trapped. This should help workloads that are short-lived, do not care
about NUMA placement or have bound themselves to a single node.

This takes advantage of the logic in "mm: sched: numa: Implement slow
start for working set sampling" to delay when the checks are made. This
will take advantage of processes that set their CPU and node bindings
early in their lifetime. It will also potentially allow any initial load
balancing to take place.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mm_types.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e850a23dd6ec..197422a1598c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -418,10 +418,20 @@ struct mm_struct {
 
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
+
+	/*
+	 * The first node a task was scheduled on. If a task runs on
+	 * a different node than Make PTE Scan Go Now.
+	 */
+	int first_nid;
 #endif
 	struct uprobes_state uprobes_state;
 };
 
+/* first nid will either be a valid NID or one of these values */
+#define NUMA_PTE_SCAN_INIT	-1
+#define NUMA_PTE_SCAN_ACTIVE	-2
+
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
-- 
cgit v1.2.3


From b32967ff101a7508f70be8de59b278d4df92fa00 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 19 Nov 2012 12:35:47 +0000
Subject: mm: numa: Add THP migration for the NUMA working set scanning fault
 case.

Note: This is very heavily based on a patch from Peter Zijlstra with
	fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner.  That patch
	put a lot of migration logic into mm/huge_memory.c where it does
	not belong. This version puts tries to share some of the migration
	logic with migrate_misplaced_page.  However, it should be noted
	that now migrate.c is doing more with the pagetable manipulation
	than is preferred. The end result is barely recognisable so as
	before, the signed-offs had to be removed but will be re-added if
	the original authors are ok with it.

Add THP migration for the NUMA working set scanning fault case.

It uses the page lock to serialize. No migration pte dance is
necessary because the pte is already unmapped when we decide
to migrate.

[dhillf@gmail.com: Fix memory leak on isolation failure]
[dhillf@gmail.com: Fix transfer of last_nid information]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/migrate.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 91556889adac..51eac4bdc606 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -79,6 +79,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 extern int migrate_misplaced_page(struct page *page, int node);
 extern int migrate_misplaced_page(struct page *page, int node);
 extern bool migrate_ratelimited(int node);
+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, pmd_t entry,
+			unsigned long address,
+			struct page *page, int node);
+
 #else
 static inline int migrate_misplaced_page(struct page *page, int node)
 {
@@ -88,6 +94,15 @@ static inline bool migrate_ratelimited(int node)
 {
 	return false;
 }
+
+static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, pmd_t entry,
+			unsigned long address,
+			struct page *page, int node)
+{
+	return -EAGAIN;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* _LINUX_MIGRATE_H */
-- 
cgit v1.2.3


From 220018d388b8ab1fca1c5f0c6474bab47ad2c9c0 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 5 Dec 2012 09:32:56 +0000
Subject: mm: numa: Add THP migration for the NUMA working set scanning fault
 case build fix

Commit "Add THP migration for the NUMA working set scanning fault case"
breaks the build because HPAGE_PMD_SHIFT and HPAGE_PMD_MASK defined to
explode without CONFIG_TRANSPARENT_HUGEPAGE:

mm/migrate.c: In function 'migrate_misplaced_transhuge_page_put':
mm/migrate.c:1549: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed
mm/migrate.c:1564: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed
mm/migrate.c:1566: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed
mm/migrate.c:1573: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed
mm/migrate.c:1606: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed
mm/migrate.c:1648: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed

CONFIG_NUMA_BALANCING allows compilation without enabling transparent
hugepages, so define the dummy function for such a configuration and only
define migrate_misplaced_transhuge_page_put() when transparent hugepages
are enabled.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/migrate.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 51eac4bdc606..d52afb9a790c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -79,12 +79,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 extern int migrate_misplaced_page(struct page *page, int node);
 extern int migrate_misplaced_page(struct page *page, int node);
 extern bool migrate_ratelimited(int node);
-extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-			struct vm_area_struct *vma,
-			pmd_t *pmd, pmd_t entry,
-			unsigned long address,
-			struct page *page, int node);
-
 #else
 static inline int migrate_misplaced_page(struct page *page, int node)
 {
@@ -94,7 +88,15 @@ static inline bool migrate_ratelimited(int node)
 {
 	return false;
 }
+#endif /* CONFIG_NUMA_BALANCING */
 
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, pmd_t entry,
+			unsigned long address,
+			struct page *page, int node);
+#else
 static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			pmd_t *pmd, pmd_t entry,
@@ -103,6 +105,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 {
 	return -EAGAIN;
 }
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
 
 #endif /* _LINUX_MIGRATE_H */
-- 
cgit v1.2.3


From 5a505085f043e8380f83610f79642853c051e2f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 2 Dec 2012 19:56:46 +0000
Subject: mm/rmap: Convert the struct anon_vma::mutex to an rwsem

Convert the struct anon_vma::mutex to an rwsem, which will help
in solving a page-migration scalability problem. (Addressed in
a separate patch.)

The conversion is simple and straightforward: in every case
where we mutex_lock()ed we'll now down_write().

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/rmap.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bfe1f4780644..f3f41d242e25 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/memcontrol.h>
 
 /*
@@ -25,8 +25,8 @@
  * pointing to this anon_vma once its vma list is empty.
  */
 struct anon_vma {
-	struct anon_vma *root;	/* Root of this anon_vma tree */
-	struct mutex mutex;	/* Serialize access to vma list */
+	struct anon_vma *root;		/* Root of this anon_vma tree */
+	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
 	/*
 	 * The refcount is taken on an anon_vma when there is no
 	 * guarantee that the vma of page tables will exist for
@@ -64,7 +64,7 @@ struct anon_vma_chain {
 	struct vm_area_struct *vma;
 	struct anon_vma *anon_vma;
 	struct list_head same_vma;   /* locked by mmap_sem & page_table_lock */
-	struct rb_node rb;			/* locked by anon_vma->mutex */
+	struct rb_node rb;			/* locked by anon_vma->rwsem */
 	unsigned long rb_subtree_last;
 #ifdef CONFIG_DEBUG_VM_RB
 	unsigned long cached_vma_start, cached_vma_last;
@@ -108,24 +108,24 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
-		mutex_lock(&anon_vma->root->mutex);
+		down_write(&anon_vma->root->rwsem);
 }
 
 static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
-		mutex_unlock(&anon_vma->root->mutex);
+		up_write(&anon_vma->root->rwsem);
 }
 
 static inline void anon_vma_lock(struct anon_vma *anon_vma)
 {
-	mutex_lock(&anon_vma->root->mutex);
+	down_write(&anon_vma->root->rwsem);
 }
 
 static inline void anon_vma_unlock(struct anon_vma *anon_vma)
 {
-	mutex_unlock(&anon_vma->root->mutex);
+	up_write(&anon_vma->root->rwsem);
 }
 
 /*
-- 
cgit v1.2.3


From 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 2 Dec 2012 19:56:50 +0000
Subject: mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon()
 more scalable

rmap_walk_anon() and try_to_unmap_anon() appears to be too
careful about locking the anon vma: while it needs protection
against anon vma list modifications, it does not need exclusive
access to the list itself.

Transforming this exclusive lock to a read-locked rwsem removes
a global lock from the hot path of page-migration intense
threaded workloads which can cause pathological performance like
this:

    96.43%        process 0  [kernel.kallsyms]  [k] perf_trace_sched_switch
                  |
                  --- perf_trace_sched_switch
                      __schedule
                      schedule
                      schedule_preempt_disabled
                      __mutex_lock_common.isra.6
                      __mutex_lock_slowpath
                      mutex_lock
                     |
                     |--50.61%-- rmap_walk
                     |          move_to_new_page
                     |          migrate_pages
                     |          migrate_misplaced_page
                     |          __do_numa_page.isra.69
                     |          handle_pte_fault
                     |          handle_mm_fault
                     |          __do_page_fault
                     |          do_page_fault
                     |          page_fault
                     |          __memset_sse2
                     |          |
                     |           --100.00%-- worker_thread
                     |                     |
                     |                      --100.00%-- start_thread
                     |
                      --49.39%-- page_lock_anon_vma
                                try_to_unmap_anon
                                try_to_unmap
                                migrate_pages
                                migrate_misplaced_page
                                __do_numa_page.isra.69
                                handle_pte_fault
                                handle_mm_fault
                                __do_page_fault
                                do_page_fault
                                page_fault
                                __memset_sse2
                                |
                                 --100.00%-- worker_thread
                                           start_thread

With this change applied the profile is now nicely flat
and there's no anon-vma related scheduling/blocking.

Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(),
to make it clearer that it's an exclusive write-lock in
that case - suggested by Rik van Riel.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/huge_mm.h |  2 +-
 include/linux/rmap.h    | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 027ad04ef3a8..0d1208c0bdc4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -102,7 +102,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
 #define wait_split_huge_page(__anon_vma, __pmd)				\
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
-		anon_vma_lock(__anon_vma);				\
+		anon_vma_lock_write(__anon_vma);			\
 		anon_vma_unlock(__anon_vma);				\
 		BUG_ON(pmd_trans_splitting(*____pmd) ||			\
 		       pmd_trans_huge(*____pmd));			\
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index f3f41d242e25..c20635c527a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -118,7 +118,7 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 		up_write(&anon_vma->root->rwsem);
 }
 
-static inline void anon_vma_lock(struct anon_vma *anon_vma)
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
 	down_write(&anon_vma->root->rwsem);
 }
@@ -128,6 +128,17 @@ static inline void anon_vma_unlock(struct anon_vma *anon_vma)
 	up_write(&anon_vma->root->rwsem);
 }
 
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+	down_read(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+	up_read(&anon_vma->root->rwsem);
+}
+
+
 /*
  * anon_vma helper functions.
  */
@@ -220,8 +231,8 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
+struct anon_vma *page_lock_anon_vma_read(struct page *page);
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
-- 
cgit v1.2.3