From 478b360a47b71f3b5030eacd3aae6acb1a32c2b6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 15 Feb 2014 23:48:45 +0100
Subject: netfilter: nf_tables: fix nf_trace always-on with XT_TRACE=n

When using nftables with CONFIG_NETFILTER_XT_TARGET_TRACE=n, we get
lots of "TRACE: filter:output:policy:1 IN=..." warnings as several
places will leave skb->nf_trace uninitialised.

Unlike iptables tracing functionality is not conditional in nftables,
so always copy/zero nf_trace setting when nftables is enabled.

Move this into __nf_copy() helper.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f589c9af8cbf..d40d40b2915b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2725,7 +2725,7 @@ static inline void nf_reset(struct sk_buff *skb)
 
 static inline void nf_reset_trace(struct sk_buff *skb)
 {
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
 	skb->nf_trace = 0;
 #endif
 }
@@ -2742,6 +2742,9 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src)
 	dst->nf_bridge  = src->nf_bridge;
 	nf_bridge_get(src->nf_bridge);
 #endif
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
+	dst->nf_trace = src->nf_trace;
+#endif
 }
 
 static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
-- 
cgit v1.2.3


From 994c41ee0ac875797b4dfef509ac7753e2649b4d Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ti.com>
Date: Thu, 30 Jan 2014 13:17:20 +0200
Subject: ARM: OMAP2+: clock: fix clkoutx2 with CLK_SET_RATE_PARENT

If CLK_SET_RATE_PARENT is set for a clkoutx2 clock, calling
clk_set_rate() on the clock "skips" the x2 multiplier as there are no
set_rate and round_rate functions defined for the clkoutx2.

This results in getting double the requested clock rates, breaking the
display on omap3430 based devices. This got broken when
d0f58bd3bba3877fb1af4664c4e33273d36f00e4 and related patches were merged
for v3.14, as omapdss driver now relies more on the clk-framework and
CLK_SET_RATE_PARENT.

This patch implements set_rate and round_rate for clkoutx2.

Tested on OMAP3430, OMAP3630, OMAP4460.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
Acked-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Paul Walmsley <paul@pwsan.com>
---
 include/linux/clk/ti.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 092b64168d7f..4a21a872dbbd 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -245,6 +245,10 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate,
 void omap2_init_clk_clkdm(struct clk_hw *clk);
 unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw,
 				    unsigned long parent_rate);
+int omap3_clkoutx2_set_rate(struct clk_hw *hw, unsigned long rate,
+					unsigned long parent_rate);
+long omap3_clkoutx2_round_rate(struct clk_hw *hw, unsigned long rate,
+		unsigned long *prate);
 int omap2_clkops_enable_clkdm(struct clk_hw *hw);
 void omap2_clkops_disable_clkdm(struct clk_hw *hw);
 int omap2_clk_disable_autoidle_all(void);
-- 
cgit v1.2.3


From feb71dae1f9e0aeb056f7f639a21e620d327fc66 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 20 Feb 2014 15:32:37 -0800
Subject: blk-mq: merge blk_mq_insert_request and blk_mq_run_request

It's almost identical to blk_mq_insert_request, so fold the two into one
slightly more generic function by making the flush special case a bit
smarted.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18ba8a627f46..ff28fe37ddda 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -121,8 +121,7 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request_queue *, struct request *,
-		bool, bool);
+void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-- 
cgit v1.2.3


From d6a25b31315327eef7785b895c354cc45c3f3742 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 20 Feb 2014 15:32:38 -0800
Subject: blk-mq: support partial I/O completions

Add a new blk_mq_end_io_partial function to partially complete requests
as needed by the SCSI layer.  We do this by reusing blk_update_request
to advance the bio instead of having a simplified version of it in
the blk-mq code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ff28fe37ddda..2ff2e8d982be 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -133,7 +133,13 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_ind
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
-void blk_mq_end_io(struct request *rq, int error);
+bool blk_mq_end_io_partial(struct request *rq, int error,
+		unsigned int nr_bytes);
+static inline void blk_mq_end_io(struct request *rq, int error)
+{
+	bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
+	BUG_ON(!done);
+}
 
 void blk_mq_complete_request(struct request *rq);
 
-- 
cgit v1.2.3


From 6f285b19d09f72e801525f5eea1bdad22e559bf0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 28 Feb 2014 19:44:55 -0800
Subject: audit: Send replies in the proper network namespace.

In perverse cases of file descriptor passing the current network
namespace of a process and the network namespace of a socket used by
that socket may differ.  Therefore use the network namespace of the
appropiate socket to ensure replies always go to the appropiate
socket.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/audit.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index aa865a9a4c4f..ec1464df4c60 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -43,6 +43,7 @@ struct mq_attr;
 struct mqstat;
 struct audit_watch;
 struct audit_tree;
+struct sk_buff;
 
 struct audit_krule {
 	int			vers_ops;
@@ -463,7 +464,7 @@ extern int audit_filter_user(int type);
 extern int audit_filter_type(int type);
 extern int audit_rule_change(int type, __u32 portid, int seq,
 				void *data, size_t datasz);
-extern int audit_list_rules_send(__u32 portid, int seq);
+extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
 
 extern u32 audit_enabled;
 #else /* CONFIG_AUDIT */
-- 
cgit v1.2.3


From b7e63a1079b266866a732cf699d8c4d61391bbda Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 26 Feb 2014 11:19:14 -0800
Subject: NFSv4: Fix another nfs4_sequence corruptor

nfs4_release_lockowner needs to set the rpc_message reply to point to
the nfs4_sequence_res in order to avoid another Oopsable situation
in nfs41_assign_slot.

Fixes: fbd4bfd1d9d21 (NFS: Add nfs4_sequence calls for RELEASE_LOCKOWNER)
Cc: stable@vger.kernel.org # 3.12+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/nfs_xdr.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b2fb167b2e6d..5624e4e2763c 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -467,9 +467,14 @@ struct nfs_lockt_res {
 };
 
 struct nfs_release_lockowner_args {
+	struct nfs4_sequence_args	seq_args;
 	struct nfs_lowner	lock_owner;
 };
 
+struct nfs_release_lockowner_res {
+	struct nfs4_sequence_res	seq_res;
+};
+
 struct nfs4_delegreturnargs {
 	struct nfs4_sequence_args	seq_args;
 	const struct nfs_fh *fhandle;
-- 
cgit v1.2.3


From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 26 Feb 2014 13:37:38 -0500
Subject: tracing: Do not add event files for modules that fail tracepoints

If a module fails to add its tracepoints due to module tainting, do not
create the module event infrastructure in the debugfs directory. As the events
will not work and worse yet, they will silently fail, making the user wonder
why the events they enable do not display anything.

Having a warning on module load and the events not visible to the users
will make the cause of the problem much clearer.

Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org

Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT"
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: stable@vger.kernel.org # 2.6.31+
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index accc497f8d72..7159a0a933df 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -60,6 +60,12 @@ struct tp_module {
 	unsigned int num_tracepoints;
 	struct tracepoint * const *tracepoints_ptrs;
 };
+bool trace_module_has_bad_taint(struct module *mod);
+#else
+static inline bool trace_module_has_bad_taint(struct module *mod)
+{
+	return false;
+}
 #endif /* CONFIG_MODULES */
 
 struct tracepoint_iter {
-- 
cgit v1.2.3


From 668f9abbd4334e6c29fa8acd71635c4f9101caa7 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 3 Mar 2014 15:38:18 -0800
Subject: mm: close PageTail race

Commit bf6bddf1924e ("mm: introduce compaction and migration for
ballooned pages") introduces page_count(page) into memory compaction
which dereferences page->first_page if PageTail(page).

This results in a very rare NULL pointer dereference on the
aforementioned page_count(page).  Indeed, anything that does
compound_head(), including page_count() is susceptible to racing with
prep_compound_page() and seeing a NULL or dangling page->first_page
pointer.

This patch uses Andrea's implementation of compound_trans_head() that
deals with such a race and makes it the default compound_head()
implementation.  This includes a read memory barrier that ensures that
if PageTail(head) is true that we return a head page that is neither
NULL nor dangling.  The patch then adds a store memory barrier to
prep_compound_page() to ensure page->first_page is set.

This is the safest way to ensure we see the head page that we are
expecting, PageTail(page) is already in the unlikely() path and the
memory barriers are unfortunately required.

Hugetlbfs is the exception, we don't enforce a store memory barrier
during init since no race is possible.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Holger Kiehl <Holger.Kiehl@dwd.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 41 -----------------------------------------
 include/linux/mm.h      | 14 ++++++++++++--
 2 files changed, 12 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index db512014e061..b826239bdce0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -157,46 +157,6 @@ static inline int hpage_nr_pages(struct page *page)
 		return HPAGE_PMD_NR;
 	return 1;
 }
-/*
- * compound_trans_head() should be used instead of compound_head(),
- * whenever the "page" passed as parameter could be the tail of a
- * transparent hugepage that could be undergoing a
- * __split_huge_page_refcount(). The page structure layout often
- * changes across releases and it makes extensive use of unions. So if
- * the page structure layout will change in a way that
- * page->first_page gets clobbered by __split_huge_page_refcount, the
- * implementation making use of smp_rmb() will be required.
- *
- * Currently we define compound_trans_head as compound_head, because
- * page->private is in the same union with page->first_page, and
- * page->private isn't clobbered. However this also means we're
- * currently leaving dirt into the page->private field of anonymous
- * pages resulting from a THP split, instead of setting page->private
- * to zero like for every other page that has PG_private not set. But
- * anonymous pages don't use page->private so this is not a problem.
- */
-#if 0
-/* This will be needed if page->private will be clobbered in split_huge_page */
-static inline struct page *compound_trans_head(struct page *page)
-{
-	if (PageTail(page)) {
-		struct page *head;
-		head = page->first_page;
-		smp_rmb();
-		/*
-		 * head may be a dangling pointer.
-		 * __split_huge_page_refcount clears PageTail before
-		 * overwriting first_page, so if PageTail is still
-		 * there it means the head pointer isn't dangling.
-		 */
-		if (PageTail(page))
-			return head;
-	}
-	return page;
-}
-#else
-#define compound_trans_head(page) compound_head(page)
-#endif
 
 extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long addr, pmd_t pmd, pmd_t *pmdp);
@@ -226,7 +186,6 @@ static inline int split_huge_page(struct page *page)
 	do { } while (0)
 #define split_huge_page_pmd_mm(__mm, __address, __pmd)	\
 	do { } while (0)
-#define compound_trans_head(page) compound_head(page)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
 				   unsigned long *vm_flags, int advice)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46eade6a..03ab3e58f511 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -399,8 +399,18 @@ static inline void compound_unlock_irqrestore(struct page *page,
 
 static inline struct page *compound_head(struct page *page)
 {
-	if (unlikely(PageTail(page)))
-		return page->first_page;
+	if (unlikely(PageTail(page))) {
+		struct page *head = page->first_page;
+
+		/*
+		 * page->first_page may be a dangling pointer to an old
+		 * compound page, so recheck that it is still a tail
+		 * page before returning.
+		 */
+		smp_rmb();
+		if (likely(PageTail(page)))
+			return head;
+	}
 	return page;
 }
 
-- 
cgit v1.2.3


From 9050d7eba40b3d79551668f54e68fd6f51945ef3 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 3 Mar 2014 15:38:27 -0800
Subject: mm: include VM_MIXEDMAP flag in the VM_SPECIAL list to avoid
 m(un)locking

Daniel Borkmann reported a VM_BUG_ON assertion failing:

  ------------[ cut here ]------------
  kernel BUG at mm/mlock.c:528!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: ccm arc4 iwldvm [...]
   video
  CPU: 3 PID: 2266 Comm: netsniff-ng Not tainted 3.14.0-rc2+ #8
  Hardware name: LENOVO 2429BP3/2429BP3, BIOS G4ET37WW (1.12 ) 05/29/2012
  task: ffff8801f87f9820 ti: ffff88002cb44000 task.ti: ffff88002cb44000
  RIP: 0010:[<ffffffff81171ad0>]  [<ffffffff81171ad0>] munlock_vma_pages_range+0x2e0/0x2f0
  Call Trace:
    do_munmap+0x18f/0x3b0
    vm_munmap+0x41/0x60
    SyS_munmap+0x22/0x30
    system_call_fastpath+0x1a/0x1f
  RIP   munlock_vma_pages_range+0x2e0/0x2f0
  ---[ end trace a0088dcf07ae10f2 ]---

because munlock_vma_pages_range() thinks it's unexpectedly in the middle
of a THP page.  This can be reproduced with default config since 3.11
kernels.  A reproducer can be found in the kernel's selftest directory
for networking by running ./psock_tpacket.

The problem is that an order=2 compound page (allocated by
alloc_one_pg_vec_page() is part of the munlocked VM_MIXEDMAP vma (mapped
by packet_mmap()) and mistaken for a THP page and assumed to be order=9.

The checks for THP in munlock came with commit ff6a6da60b89 ("mm:
accelerate munlock() treatment of THP pages"), i.e.  since 3.9, but did
not trigger a bug.  It just makes munlock_vma_pages_range() skip such
compound pages until the next 512-pages-aligned page, when it encounters
a head page.  This is however not a problem for vma's where mlocking has
no effect anyway, but it can distort the accounting.

Since commit 7225522bb429 ("mm: munlock: batch non-THP page isolation
and munlock+putback using pagevec") this can trigger a VM_BUG_ON in
PageTransHuge() check.

This patch fixes the issue by adding VM_MIXEDMAP flag to VM_SPECIAL, a
list of flags that make vma's non-mlockable and non-mergeable.  The
reasoning is that VM_MIXEDMAP vma's are similar to VM_PFNMAP, which is
already on the VM_SPECIAL list, and both are intended for non-LRU pages
where mlocking makes no sense anyway.  Related Lkml discussion can be
found in [2].

 [1] tools/testing/selftests/net/psock_tpacket
 [2] https://lkml.org/lkml/2014/1/10/427

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Reported-by: Daniel Borkmann <dborkman@redhat.com>
Tested-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: John David Anglin <dave.anglin@bell.net>
Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Tested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org> [3.11.x+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 03ab3e58f511..a1fe25110f50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -175,7 +175,7 @@ extern unsigned int kobjsize(const void *objp);
  * Special vmas that are non-mergable, non-mlock()able.
  * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
  */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP)
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
 /*
  * mapping from the currently active vm_flags protection bits (the
-- 
cgit v1.2.3


From 1ae71d03194ea7424cbd14e449581f67c463d20d Mon Sep 17 00:00:00 2001
From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
Date: Mon, 3 Mar 2014 15:38:39 -0800
Subject: mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS

When doing some numa tests on powerpc, I triggered an oops bug.  I find
it is caused by using page->_last_cpupid.  It should be initialized as
"-1 & LAST_CPUPID_MASK", but not "-1".  Otherwise, in task_numa_fault(),
we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).  And
finally cause an oops bug in task_numa_group(), since the online cpu is
less than possible cpu.  This happen with CONFIG_SPARSE_VMEMMAP disabled

Call trace:

  SMP NR_CPUS=64 NUMA PowerNV
  Modules linked in:
  CPU: 24 PID: 804 Comm: systemd-udevd Not tainted3.13.0-rc1+ #32
  task: c000001e2746aa80 ti: c000001e32c50000 task.ti:c000001e32c50000
  REGS: c000001e32c53510 TRAP: 0300   Not tainted(3.13.0-rc1+)
  MSR: 9000000000009032 <SF,HV,EE,ME,IR,DR,RI>  CR:28024424  XER: 20000000
  CFAR: c000000000009324 DAR: 7265717569726857 DSISR:40000000 SOFTE: 1
  NIP  .task_numa_fault+0x1470/0x2370
  LR  .task_numa_fault+0x1468/0x2370
  Call Trace:
   .task_numa_fault+0x1468/0x2370 (unreliable)
   .do_numa_page+0x480/0x4a0
   .handle_mm_fault+0x4ec/0xc90
   .do_page_fault+0x3a8/0x890
   handle_page_fault+0x10/0x30
  Instruction dump:
  3c82fefb 3884b138 48d9cff1 60000000 48000574 3c62fefb3863af78 3c82fefb
  3884b138 48d9cfd5 60000000 e93f0100 <812902e4> 7d2907b45529063e 7d2a07b4
  ---[ end trace 15f2510da5ae07cf ]---

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a1fe25110f50..c1b7414c7bef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -767,7 +767,7 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
-	return xchg(&page->_last_cpupid, cpupid);
+	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
 static inline int page_cpupid_last(struct page *page)
@@ -776,7 +776,7 @@ static inline int page_cpupid_last(struct page *page)
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
-	page->_last_cpupid = -1;
+	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
 static inline int page_cpupid_last(struct page *page)
-- 
cgit v1.2.3


From 70044d71d31d6973665ced5be04ef39ac1c09a48 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 7 Mar 2014 10:19:57 -0500
Subject: firewire: don't use PREPARE_DELAYED_WORK

PREPARE_[DELAYED_]WORK() are being phased out.  They have few users
and a nasty surprise in terms of reentrancy guarantee as workqueue
considers work items to be different if they don't have the same work
function.

firewire core-device and sbp2 have been been multiplexing work items
with multiple work functions.  Introduce fw_device_workfn() and
sbp2_lu_workfn() which invoke fw_device->workfn and
sbp2_logical_unit->workfn respectively and always use the two
functions as the work functions and update the users to set the
->workfn fields instead of overriding work functions using
PREPARE_DELAYED_WORK().

This fixes a variety of possible regressions since a2c1c57be8d9
"workqueue: consider work function when searching for busy work items"
due to which fw_workqueue lost its required non-reentrancy property.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: linux1394-devel@lists.sourceforge.net
Cc: stable@vger.kernel.org # v3.9+
Cc: stable@vger.kernel.org # v3.8.2+
Cc: stable@vger.kernel.org # v3.4.60+
Cc: stable@vger.kernel.org # v3.2.40+
---
 include/linux/firewire.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 5d7782e42b8f..c3683bdf28fe 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -200,6 +200,7 @@ struct fw_device {
 	unsigned irmc:1;
 	unsigned bc_implemented:2;
 
+	work_func_t workfn;
 	struct delayed_work work;
 	struct fw_attribute_group attribute_group;
 };
-- 
cgit v1.2.3


From 52a4c6404f91f2d2c5592ee6365a8418c4565f53 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@redhat.com>
Date: Fri, 7 Mar 2014 12:44:19 +0100
Subject: selinux: add gfp argument to security_xfrm_policy_alloc and fix
 callers

security_xfrm_policy_alloc can be called in atomic context so the
allocation should be done with GFP_ATOMIC. Add an argument to let the
callers choose the appropriate way. In order to do so a gfp argument
needs to be added to the method xfrm_policy_alloc_security in struct
security_operations and to the internal function
selinux_xfrm_alloc_user. After that switch to GFP_ATOMIC in the atomic
callers and leave GFP_KERNEL as before for the rest.
The path that needed the gfp argument addition is:
security_xfrm_policy_alloc -> security_ops.xfrm_policy_alloc_security ->
all users of xfrm_policy_alloc_security (e.g. selinux_xfrm_policy_alloc) ->
selinux_xfrm_alloc_user (here the allocation used to be GFP_KERNEL only)

Now adding a gfp argument to selinux_xfrm_alloc_user requires us to also
add it to security_context_to_sid which is used inside and prior to this
patch did only GFP_KERNEL allocation. So add gfp argument to
security_context_to_sid and adjust all of its callers as well.

CC: Paul Moore <paul@paul-moore.com>
CC: Dave Jones <davej@redhat.com>
CC: Steffen Klassert <steffen.klassert@secunet.com>
CC: Fan Du <fan.du@windriver.com>
CC: David S. Miller <davem@davemloft.net>
CC: LSM list <linux-security-module@vger.kernel.org>
CC: SELinux list <selinux@tycho.nsa.gov>

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/security.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 5623a7f965b7..2fc42d191f79 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1040,6 +1040,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Allocate a security structure to the xp->security field; the security
  *	field is initialized to NULL when the xfrm_policy is allocated.
  *	Return 0 if operation was successful (memory to allocate, legal context)
+ *	@gfp is to specify the context for the allocation
  * @xfrm_policy_clone_security:
  *	@old_ctx contains an existing xfrm_sec_ctx.
  *	@new_ctxp contains a new xfrm_sec_ctx being cloned from old.
@@ -1683,7 +1684,7 @@ struct security_operations {
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 	int (*xfrm_policy_alloc_security) (struct xfrm_sec_ctx **ctxp,
-			struct xfrm_user_sec_ctx *sec_ctx);
+			struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp);
 	int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx);
 	void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx);
 	int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx);
@@ -2859,7 +2860,8 @@ static inline void security_skb_owned_by(struct sk_buff *skb, struct sock *sk)
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 
-int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx);
+int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
+			       struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp);
 int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp);
 void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx);
 int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx);
@@ -2877,7 +2879,9 @@ void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl);
 
 #else	/* CONFIG_SECURITY_NETWORK_XFRM */
 
-static inline int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx)
+static inline int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
+					     struct xfrm_user_sec_ctx *sec_ctx,
+					     gfp_t gfp)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 9c225f2655e36a470c4f58dbbc99244c5fc7f2d4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 3 Mar 2014 09:36:58 -0800
Subject: vfs: atomic f_pos accesses as per POSIX

Our write() system call has always been atomic in the sense that you get
the expected thread-safe contiguous write, but we haven't actually
guaranteed that concurrent writes are serialized wrt f_pos accesses, so
threads (or processes) that share a file descriptor and use "write()"
concurrently would quite likely overwrite each others data.

This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says:

 "2.9.7 Thread Interactions with Regular File Operations

  All of the following functions shall be atomic with respect to each
  other in the effects specified in POSIX.1-2008 when they operate on
  regular files or symbolic links: [...]"

and one of the effects is the file position update.

This unprotected file position behavior is not new behavior, and nobody
has ever cared.  Until now.  Yongzhi Pan reported unexpected behavior to
Michael Kerrisk that was due to this.

This resolves the issue with a f_pos-specific lock that is taken by
read/write/lseek on file descriptors that may be shared across threads
or processes.

Reported-by: Yongzhi Pan <panyongzhi@gmail.com>
Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/file.h | 6 ++++--
 include/linux/fs.h   | 6 +++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index cbacf4faf447..f2517fa2d610 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed)
 
 struct fd {
 	struct file *file;
-	int need_put;
+	unsigned int flags;
 };
+#define FDPUT_FPUT       1
+#define FDPUT_POS_UNLOCK 2
 
 static inline void fdput(struct fd fd)
 {
-	if (fd.need_put)
+	if (fd.flags & FDPUT_FPUT)
 		fput(fd.file);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60829565e552..ebfde04bca06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File is opened with O_PATH; almost nothing can be done with it */
 #define FMODE_PATH		((__force fmode_t)0x4000)
 
+/* File needs atomic accesses to f_pos */
+#define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
@@ -780,13 +783,14 @@ struct file {
 	const struct file_operations	*f_op;
 
 	/*
-	 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
+	 * Protects f_ep_links, f_flags.
 	 * Must not be taken from IRQ context.
 	 */
 	spinlock_t		f_lock;
 	atomic_long_t		f_count;
 	unsigned int 		f_flags;
 	fmode_t			f_mode;
+	struct mutex		f_pos_lock;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;
-- 
cgit v1.2.3


From bd2a31d522344b3ac2fb680bd2366e77a9bd8209 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 4 Mar 2014 14:54:22 -0500
Subject: get rid of fget_light()

instead of returning the flags by reference, we can just have the
low-level primitive return those in lower bits of unsigned long,
with struct file * derived from the rest.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/file.h | 21 +++++++++++----------
 include/linux/fs.h   |  2 +-
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index f2517fa2d610..4d69123377a2 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -40,23 +40,24 @@ static inline void fdput(struct fd fd)
 }
 
 extern struct file *fget(unsigned int fd);
-extern struct file *fget_light(unsigned int fd, int *fput_needed);
+extern struct file *fget_raw(unsigned int fd);
+extern unsigned long __fdget(unsigned int fd);
+extern unsigned long __fdget_raw(unsigned int fd);
+extern unsigned long __fdget_pos(unsigned int fd);
 
-static inline struct fd fdget(unsigned int fd)
+static inline struct fd __to_fd(unsigned long v)
 {
-	int b;
-	struct file *f = fget_light(fd, &b);
-	return (struct fd){f,b};
+	return (struct fd){(struct file *)(v & ~3),v & 3};
 }
 
-extern struct file *fget_raw(unsigned int fd);
-extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
+static inline struct fd fdget(unsigned int fd)
+{
+	return __to_fd(__fdget(fd));
+}
 
 static inline struct fd fdget_raw(unsigned int fd)
 {
-	int b;
-	struct file *f = fget_raw_light(fd, &b);
-	return (struct fd){f,b};
+	return __to_fd(__fdget_raw(fd));
 }
 
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ebfde04bca06..23b2a35d712e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -812,7 +812,7 @@ struct file {
 #ifdef CONFIG_DEBUG_WRITECOUNT
 	unsigned long f_mnt_write_state;
 #endif
-};
+} __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */
 
 struct file_handle {
 	__u32 handle_bytes;
-- 
cgit v1.2.3


From e97ca8e5b864f88b028c1759ba8536fa827d6d96 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 10 Mar 2014 15:49:43 -0700
Subject: mm: fix GFP_THISNODE callers and clarify

GFP_THISNODE is for callers that implement their own clever fallback to
remote nodes.  It restricts the allocation to the specified node and
does not invoke reclaim, assuming that the caller will take care of it
when the fallback fails, e.g.  through a subsequent allocation request
without GFP_THISNODE set.

However, many current GFP_THISNODE users only want the node exclusive
aspect of the flag, without actually implementing their own fallback or
triggering reclaim if necessary.  This results in things like page
migration failing prematurely even when there is easily reclaimable
memory available, unless kswapd happens to be running already or a
concurrent allocation attempt triggers the necessary reclaim.

Convert all callsites that don't implement their own fallback strategy
to __GFP_THISNODE.  This restricts the allocation a single node too, but
at the same time allows the allocator to enter the slowpath, wake
kswapd, and invoke direct reclaim if necessary, to make the allocation
happen when memory is full.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Jan Stancek <jstancek@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h    | 4 ++++
 include/linux/mmzone.h | 4 ++--
 include/linux/slab.h   | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0437439bc047..39b81dc7d01a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -123,6 +123,10 @@ struct vm_area_struct;
 			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
 			 __GFP_NO_KSWAPD)
 
+/*
+ * GFP_THISNODE does not perform any reclaim, you most likely want to
+ * use __GFP_THISNODE to allocate from a given node without fallback!
+ */
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
 #else
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f2052c83154..9b61b9bf81ac 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -590,10 +590,10 @@ static inline bool zone_is_empty(struct zone *zone)
 
 /*
  * The NUMA zonelists are doubled because we need zonelists that restrict the
- * allocations to a single node for GFP_THISNODE.
+ * allocations to a single node for __GFP_THISNODE.
  *
  * [0]	: Zonelist with fallback
- * [1]	: No fallback (GFP_THISNODE)
+ * [1]	: No fallback (__GFP_THISNODE)
  */
 #define MAX_ZONELISTS 2
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9260abdd67df..b5b2df60299e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -410,7 +410,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  *
  * %GFP_NOWAIT - Allocation will not sleep.
  *
- * %GFP_THISNODE - Allocate node-local memory only.
+ * %__GFP_THISNODE - Allocate node-local memory only.
  *
  * %GFP_DMA - Allocation suitable for DMA.
  *   Should only be used for kmalloc() caches. Otherwise, use a
-- 
cgit v1.2.3


From ff0992e9036e9810e7cd45234fa32ca1e79750e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= <bjorn@mork.no>
Date: Mon, 17 Mar 2014 16:25:18 +0100
Subject: net: cdc_ncm: fix control message ordering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a context modified revert of commit 6a9612e2cb22
("net: cdc_ncm: remove ncm_parm field") which introduced
a NCM specification violation, causing setup errors for
some devices. These errors resulted in the device and
host disagreeing about shared settings, with complete
failure to communicate as the end result.

The NCM specification require that many of the NCM specific
control reuests are sent only while the NCM Data Interface
is in alternate setting 0. Reverting the commit ensures that
we follow this requirement.

Fixes: 6a9612e2cb22 ("net: cdc_ncm: remove ncm_parm field")
Reported-and-tested-by: Pasi Kärkkäinen <pasik@iki.fi>
Reported-by: Thomas Schäfer <tschaefer@t-online.de>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/usb/cdc_ncm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index c3fa80745996..2c14d9cdd57a 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -88,6 +88,7 @@
 #define cdc_ncm_data_intf_is_mbim(x)  ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB)
 
 struct cdc_ncm_ctx {
+	struct usb_cdc_ncm_ntb_parameters ncm_parm;
 	struct hrtimer tx_timer;
 	struct tasklet_struct bh;
 
-- 
cgit v1.2.3


From 87291347c49dc40aa339f587b209618201c2e527 Mon Sep 17 00:00:00 2001
From: Vaibhav Nagarnaik <vnagarnaik@google.com>
Date: Thu, 13 Feb 2014 19:51:48 -0800
Subject: tracing: Fix array size mismatch in format string

In event format strings, the array size is reported in two locations.
One in array subscript and then via the "size:" attribute. The values
reported there have a mismatch.

For e.g., in sched:sched_switch the prev_comm and next_comm character
arrays have subscript values as [32] where as the actual field size is
16.

name: sched_switch
ID: 301
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1;signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:char prev_comm[32];       offset:8;       size:16;        signed:1;
        field:pid_t prev_pid;   offset:24;      size:4; signed:1;
        field:int prev_prio;    offset:28;      size:4; signed:1;
        field:long prev_state;  offset:32;      size:8; signed:1;
        field:char next_comm[32];       offset:40;      size:16;        signed:1;
        field:pid_t next_pid;   offset:56;      size:4; signed:1;
        field:int next_prio;    offset:60;      size:4; signed:1;

After bisection, the following commit was blamed:
92edca0 tracing: Use direct field, type and system names

This commit removes the duplication of strings for field->name and
field->type assuming that all the strings passed in
__trace_define_field() are immutable. This is not true for arrays, where
the type string is created in event_storage variable and field->type for
all array fields points to event_storage.

Use __stringify() to create a string constant for the type string.

Also, get rid of event_storage and event_storage_mutex that are not
needed anymore.

also, an added benefit is that this reduces the overhead of events a bit more:

   text    data     bss     dec     hex filename
8424787 2036472 1302528 11763787         b3804b vmlinux
8420814 2036408 1302528 11759750         b37086 vmlinux.patched

Link: http://lkml.kernel.org/r/1392349908-29685-1-git-send-email-vnagarnaik@google.com

Cc: Laurent Chavey <chavey@google.com>
Cc: stable@vger.kernel.org # 3.10+
Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4e4cc28623ad..4cdb3a17bcb5 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -495,10 +495,6 @@ enum {
 	FILTER_TRACE_FN,
 };
 
-#define EVENT_STORAGE_SIZE 128
-extern struct mutex event_storage_mutex;
-extern char event_storage[EVENT_STORAGE_SIZE];
-
 extern int trace_event_raw_init(struct ftrace_event_call *call);
 extern int trace_define_field(struct ftrace_event_call *call, const char *type,
 			      const char *name, int offset, int size,
-- 
cgit v1.2.3


From 7e09e738afd21ef99f047425fc0b0c9be8b03254 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 20 Mar 2014 21:52:17 -0700
Subject: mm: fix swapops.h:131 bug if remap_file_pages raced migration

Add remove_linear_migration_ptes_from_nonlinear(), to fix an interesting
little include/linux/swapops.h:131 BUG_ON(!PageLocked) found by trinity:
indicating that remove_migration_ptes() failed to find one of the
migration entries that was temporarily inserted.

The problem comes from remap_file_pages()'s switch from vma_interval_tree
(good for inserting the migration entry) to i_mmap_nonlinear list (no good
for locating it again); but can only be a problem if the remap_file_pages()
range does not cover the whole of the vma (zap_pte() clears the range).

remove_migration_ptes() needs a file_nonlinear method to go down the
i_mmap_nonlinear list, applying linear location to look for migration
entries in those vmas too, just in case there was this race.

The file_nonlinear method does need rmap_walk_control.arg to do this;
but it never needed vma passed in - vma comes from its own iteration.

Reported-and-tested-by: Dave Jones <davej@redhat.com>
Reported-and-tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1da693d51255..b66c2110cb1f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -250,8 +250,7 @@ struct rmap_walk_control {
 	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
 					unsigned long addr, void *arg);
 	int (*done)(struct page *page);
-	int (*file_nonlinear)(struct page *, struct address_space *,
-					struct vm_area_struct *vma);
+	int (*file_nonlinear)(struct page *, struct address_space *, void *arg);
 	struct anon_vma *(*anon_lock)(struct page *page);
 	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
 };
-- 
cgit v1.2.3