From a6e2f029ae34f41adb6ae3812c32c5d326e1abd2 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 29 Apr 2015 12:48:40 -0400 Subject: Make asm/word-at-a-time.h available on all architectures Added the x86 implementation of word-at-a-time to the generic version, which previously only supported big-endian. Omitted the x86-specific load_unaligned_zeropad(), which in any case is also not present for the existing BE-only implementation of a word-at-a-time, and is only used under CONFIG_DCACHE_WORD_ACCESS. Added as a "generic-y" to the Kbuilds of all architectures that didn't previously have it. Signed-off-by: Chris Metcalf --- include/asm-generic/word-at-a-time.h | 80 ++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/word-at-a-time.h b/include/asm-generic/word-at-a-time.h index 94f9ea8abcae..011dde083f23 100644 --- a/include/asm-generic/word-at-a-time.h +++ b/include/asm-generic/word-at-a-time.h @@ -1,15 +1,10 @@ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H -/* - * This says "generic", but it's actually big-endian only. - * Little-endian can use more efficient versions of these - * interfaces, see for example - * arch/x86/include/asm/word-at-a-time.h - * for those. - */ - #include +#include + +#ifdef __BIG_ENDIAN struct word_at_a_time { const unsigned long high_bits, low_bits; @@ -53,4 +48,73 @@ static inline bool has_zero(unsigned long val, unsigned long *data, const struct #define zero_bytemask(mask) (~1ul << __fls(mask)) #endif +#else + +/* + * The optimal byte mask counting is probably going to be something + * that is architecture-specific. If you have a reliably fast + * bit count instruction, that might be better than the multiply + * and shift, for example. + */ +struct word_at_a_time { + const unsigned long one_bits, high_bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. 
+ */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608ul >> 56; +} + +#else /* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; +} + +#endif + +/* Return nonzero if it has a zero */ +static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) +{ + unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; + *bits = mask; + return mask; +} + +static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) +{ + return bits; +} + +static inline unsigned long create_zero_mask(unsigned long bits) +{ + bits = (bits - 1) & ~bits; + return bits >> 7; +} + +/* The mask we created is directly usable as a bytemask */ +#define zero_bytemask(mask) (mask) + +static inline unsigned long find_zero(unsigned long mask) +{ + return count_masked_bytes(mask); +} + +#endif /* __BIG_ENDIAN */ + #endif /* _ASM_WORD_AT_A_TIME_H */ -- cgit v1.2.3 From ae1ff3d623905947158fd3394854c23026337810 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 13 Jul 2015 14:31:28 +0300 Subject: iommu: iova: Move iova cache management to the iova library This is necessary to separate intel-iommu from the iova library. Signed-off-by: Sakari Ailus Signed-off-by: David Woodhouse --- include/linux/iova.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iova.h b/include/linux/iova.h index 3920a19d8194..92f7177db2ce 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -68,8 +68,8 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } -int iommu_iova_cache_init(void); -void iommu_iova_cache_destroy(void); +int iova_cache_get(void); +void iova_cache_put(void); struct iova *alloc_iova_mem(void); void free_iova_mem(struct iova *iova); -- cgit v1.2.3 From 30035e45753b708e7d47a98398500ca005e02b86 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 29 Apr 2015 12:52:04 -0400 Subject: string: provide strscpy() The strscpy() API is intended to be used instead of strlcpy(), and instead of most uses of strncpy(). - Unlike strlcpy(), it doesn't read from memory beyond (src + size). - Unlike strlcpy() or strncpy(), the API provides an easy way to check for destination buffer overflow: an -E2BIG error return value. - The provided implementation is robust in the face of the source buffer being asynchronously changed during the copy, unlike the current implementation of strlcpy(). - Unlike strncpy(), the destination buffer will be NUL-terminated if the string in the source buffer is too long. - Also unlike strncpy(), the destination buffer will not be updated beyond the NUL termination, avoiding strncpy's behavior of zeroing the entire tail end of the destination buffer. (A memset() after the strscpy() can be used if this behavior is desired.) - The implementation should be reasonably performant on all platforms since it uses the asm/word-at-a-time.h API rather than simple byte copy. Kernel-to-kernel string copy is not considered to be performance critical in any case. 
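For illustration only (not part of this patch), a hypothetical caller using the new return convention could look like this, where "src" is some source string and the 16-byte buffer size is arbitrary:

	char buf[16];
	ssize_t res;

	res = strscpy(buf, src, sizeof(buf));
	if (res == -E2BIG) {
		/* src did not fit: buf still holds a NUL-terminated, truncated copy */
	} else {
		/* res is the number of characters copied, excluding the trailing NUL */
	}
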
Signed-off-by: Chris Metcalf --- include/linux/string.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index a8d90db9c4b0..9ef7795e65e4 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -25,6 +25,9 @@ extern char * strncpy(char *,const char *, __kernel_size_t); #ifndef __HAVE_ARCH_STRLCPY size_t strlcpy(char *, const char *, size_t); #endif +#ifndef __HAVE_ARCH_STRSCPY +ssize_t __must_check strscpy(char *, const char *, size_t); +#endif #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); #endif -- cgit v1.2.3 From cd33dc9ac2977ebe30cecbf39d2992190fbac5b4 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 8 Sep 2015 14:51:12 +0100 Subject: thermal: Fix thermal_zone_of_sensor_register to match documentation thermal_zone_of_sensor_register is documented as returning a pointer to either a valid thermal_zone_device on success, or a corresponding ERR_PTR() value. In contrast, the function returns NULL when THERMAL_OF is configured off. Fix this. Signed-off-by: Punit Agrawal Acked-by: Guenter Roeck Cc: Eduardo Valentin Cc: Zhang Rui Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 17292fee8686..0c5518e13584 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -360,7 +360,7 @@ static inline struct thermal_zone_device * thermal_zone_of_sensor_register(struct device *dev, int id, void *data, const struct thermal_zone_of_device_ops *ops) { - return NULL; + return ERR_PTR(-ENODEV); } static inline -- cgit v1.2.3 From c973c3bcec3752455c4d7545edd42935cd7942d9 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Mon, 14 Sep 2015 14:23:50 +0100 Subject: thermal: Add a function to get the minimum power The thermal core already has a function to get the maximum power of a cooling device: power_actor_get_max_power(). Add a function to get the minimum power of a cooling device. 
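As a sketch only, a hypothetical caller of the new helper (with "cdev" and "tz" assumed to be an existing cooling device and thermal zone) might do:

	u32 min_power;
	int ret;

	ret = power_actor_get_min_power(cdev, tz, &min_power);
	if (ret)
		return ret;	/* e.g. -ENODEV from the stub when thermal power actors are not available */
	/* min_power now holds the cooling device's minimum power value */
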
Cc: Zhang Rui Cc: Eduardo Valentin Reviewed-by: Daniel Kurtz Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0c5518e13584..157d366e761b 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -380,6 +380,8 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) int power_actor_get_max_power(struct thermal_cooling_device *, struct thermal_zone_device *tz, u32 *max_power); +int power_actor_get_min_power(struct thermal_cooling_device *, + struct thermal_zone_device *tz, u32 *min_power); int power_actor_set_power(struct thermal_cooling_device *, struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, @@ -415,6 +417,10 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 *max_power) { return 0; } +static inline int power_actor_get_min_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, + u32 *min_power) +{ return -ENODEV; } static inline int power_actor_set_power(struct thermal_cooling_device *cdev, struct thermal_instance *tz, u32 power) { return 0; } -- cgit v1.2.3 From 63cdbc06b357dcb3a7104a421ee4a4550d7fadfd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Sep 2015 17:06:27 +0200 Subject: netfilter: bridge: fix routing of bridge frames with call-iptables=1 We can't re-use the physoutdev storage area. 1. When using NFQUEUE in PREROUTING, we attempt to bump a bogus refcnt since nf_bridge->physoutdev is garbage (ipv4/ipv6 address) 2. for same reason, we crash in physdev match in FORWARD or later if skb is routed instead of bridged. This increases nf_bridge_info to 40 bytes, but we have no other choice. Fixes: 72b1e5e4cac7 ("netfilter: bridge: reduce nf_bridge_info to 32 bytes again") Reported-by: Sander Eikelenboom Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2738d355cdf9..9987af080fa0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -179,6 +179,9 @@ struct nf_bridge_info { u8 bridged_dnat:1; __u16 frag_max_size; struct net_device *physindev; + + /* always valid & non-NULL from FORWARD on, for physdev match */ + struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; @@ -189,9 +192,6 @@ struct nf_bridge_info { * skb is out in neigh layer. */ char neigh_header[8]; - - /* always valid & non-NULL from FORWARD on, for physdev match */ - struct net_device *physoutdev; }; }; #endif -- cgit v1.2.3 From f230d1e891ba1da5953460516960894154f265db Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:06 -0700 Subject: ipv6: Rename the dst_cache helper functions in ip6_tunnel It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch rename: 1. ip6_tnl_dst_check() to ip6_tnl_dst_get() to better reflect that it will take a dst refcnt in the next patch. 2. ip6_tnl_dst_store() to ip6_tnl_dst_set() to have a more conventional name matching with ip6_tnl_dst_get(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/net/ip6_tunnel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index b8529aa1dae7..979b081a47e8 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -60,9 +60,9 @@ struct ipv6_tlv_tnl_enc_lim { __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t); +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst); +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, -- cgit v1.2.3 From cdf3464e6c6bd764277cbbe992cd12da735b92fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:07 -0700 Subject: ipv6: Fix dst_entry refcnt bugs in ip6_tunnel Problems in the current dst_entry cache in the ip6_tunnel: 1. ip6_tnl_dst_set is racy. There is no lock to protect it: - One major problem is that the dst refcnt gets messed up. F.e. the same dst_cache can be released multiple times and then triggering the infamous dst refcnt < 0 warning message. - Another issue is the inconsistency between dst_cache and dst_cookie. It can be reproduced by adding and removing the ip6gre tunnel while running a super_netperf TCP_CRR test. 2. ip6_tnl_dst_get does not take the dst refcnt before returning the dst. This patch: 1. Create a percpu dst_entry cache in ip6_tnl 2. Use a spinlock to protect the dst_cache operations 3. ip6_tnl_dst_get always takes the dst refcnt before returning Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 979b081a47e8..60b4f402f78c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -32,6 +32,12 @@ struct __ip6_tnl_parm { __be32 o_key; }; +struct ip6_tnl_dst { + spinlock_t lock; + struct dst_entry *dst; + u32 cookie; +}; + /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -39,8 +45,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct dst_entry *dst_cache; /* cached dst */ - u32 dst_cookie; + struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -61,6 +66,8 @@ struct ipv6_tlv_tnl_enc_lim { } __packed; struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); +int ip6_tnl_dst_init(struct ip6_tnl *t); +void ip6_tnl_dst_destroy(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, -- cgit v1.2.3 From 70da5b5c532f0ec8aa76b4f46158da5f010f34b3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:09 -0700 Subject: ipv6: Replace spinlock with seqlock and rcu in ip6_tunnel This patch uses a seqlock to ensure consistency between idst->dst and idst->cookie. It also makes dst freeing from fib tree to undergo a rcu grace period. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/net/ip6_tunnel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 60b4f402f78c..65c2a9397b3c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -33,8 +33,8 @@ struct __ip6_tnl_parm { }; struct ip6_tnl_dst { - spinlock_t lock; - struct dst_entry *dst; + seqlock_t lock; + struct dst_entry __rcu *dst; u32 cookie; }; -- cgit v1.2.3 From 0243ed44ad4a25dbd2e92ad97e5e12a1a6c72d6c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 15 Sep 2015 04:59:21 -0700 Subject: spi: fix kernel-doc warnings in spi.h Fix the following 'make htmldocs' warnings: .//include/linux/spi/spi.h:71: warning: No description found for parameter 'lock' .//include/linux/spi/spi.h:71: warning: Excess struct/union/enum/typedef member 'clock' description in 'spi_statistics' Signed-off-by: Geliang Tang Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 269e8afd3e2a..6b00f18f5e6b 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -34,7 +34,7 @@ extern struct bus_type spi_bus_type; /** * struct spi_statistics - statistics for spi transfers - * @clock: lock protecting this structure + * @lock: lock protecting this structure * * @messages: number of spi-messages handled * @transfers: number of spi_transfers handled -- cgit v1.2.3 From 37a1d3611c126fd9782ce5235791f898f053e763 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sun, 13 Sep 2015 10:18:33 -0700 Subject: ipv6: include NLM_F_REPLACE in route replace notifications This patch adds NLM_F_REPLACE flag to ipv6 route replace notifications. This makes nlm_flags in ipv6 replace notifications consistent with ipv4. Signed-off-by: Roopa Prabhu Acked-by: Nicolas Dichtel Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 063d30474cf6..aaf9700fc9e5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -275,7 +275,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc); int fib6_del(struct rt6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info); +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); -- cgit v1.2.3 From 0fdea1e8a2853f79d39b8555cc9de16a7e0ab26f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Sep 2015 23:43:17 -0400 Subject: SUNRPC: Ensure that we wait for connections to complete before retrying Commit 718ba5b87343, moved the responsibility for unlocking the socket to xs_tcp_setup_socket, meaning that the socket will be unlocked before we know that it has finished trying to connect. The following patch is based on an initial patch by Russell King to ensure that we delay clearing the XPRT_CONNECTING flag until we either know that we failed to initiate a connection attempt, or the connection attempt itself failed. 
Fixes: 718ba5b87343 ("SUNRPC: Add helpers to prevent socket create from racing") Reported-by: Russell King Reported-by: Russell King Tested-by: Russell King Tested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7591788e9fbf..357e44c1a46b 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -42,6 +42,7 @@ struct sock_xprt { /* * Connection of transports */ + unsigned long sock_state; struct delayed_work connect_worker; struct sockaddr_storage srcaddr; unsigned short srcport; @@ -76,6 +77,8 @@ struct sock_xprt { */ #define TCP_RPC_REPLY (1UL << 6) +#define XPRT_SOCK_CONNECTING 1U + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_XPRTSOCK_H */ -- cgit v1.2.3 From 58189ca7b27411c3dc9a5cb9eeee0906da684c59 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Sep 2015 15:10:50 -0700 Subject: net: Fix vti use case with oif in dst lookups Steffen reported that the recent change to add oif to dst lookups breaks the VTI use case. The problem is that with the oif set in the flow struct the comparison to the nh_oif is triggered. Fix by splitting the FLOWI_FLAG_VRFSRC into 2 flags -- one that triggers the vrf device cache bypass (FLOWI_FLAG_VRFSRC) and another telling the lookup to not compare nh oif (FLOWI_FLAG_SKIP_NH_OIF). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Signed-off-by: David Ahern Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/flow.h | 1 + include/net/route.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index acd6a096250e..9b85db85f13c 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -35,6 +35,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_VRFSRC 0x04 +#define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/route.h b/include/net/route.h index cc61cb95f059..f46af256880c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -255,7 +255,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; if (netif_index_is_vrf(sock_net(sk), oif)) - flow_flags |= FLOWI_FLAG_VRFSRC; + flow_flags |= FLOWI_FLAG_VRFSRC | FLOWI_FLAG_SKIP_NH_OIF; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); -- cgit v1.2.3 From 66e8c57da6bf6b847a48a5a6fda59512f733ed78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 25 Aug 2015 20:45:18 +0200 Subject: rcu: Change _wait_rcu_gp() to work around GCC bug 67055 Code like this in inline functions confuses some recent versions of gcc: const int n = const-expr; whatever_t array[n]; For more details, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67055#c13 This compiler bug results in the following failure after 114b7fd4b (rcu: Create rcu_sync infrastructure): In file included from include/linux/rcupdate.h:429:0, from include/linux/rcu_sync.h:5, from kernel/rcu/sync.c:1: include/linux/rcutiny.h: In function 'rcu_barrier_sched': include/linux/rcutiny.h:55:20: internal compiler error: Segmentation fault static inline void rcu_barrier_sched(void) This commit therefore eliminates the constant local variable in favor of direct use of the expression. 
Reported-and-tested-by: Mark Salter Reported-by: Guenter Roeck Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ff476515f716..581abf848566 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -230,12 +230,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array); #define _wait_rcu_gp(checktiny, ...) \ -do { \ - call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ - const int __n = ARRAY_SIZE(__crcu_array); \ - struct rcu_synchronize __rs_array[__n]; \ - \ - __wait_rcu_gp(checktiny, __n, __crcu_array, __rs_array); \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ + __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ + __crcu_array, __rs_array); \ } while (0) #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) -- cgit v1.2.3 From 0315e382704817b279e5693dca8ab9d89aa20b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nikola=20Forr=C3=B3?= Date: Thu, 17 Sep 2015 16:01:32 +0200 Subject: net: Fix behaviour of unreachable, blackhole and prohibit routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Man page of ip-route(8) says following about route types: unreachable - these destinations are unreachable. Packets are dis‐ carded and the ICMP message host unreachable is generated. The local senders get an EHOSTUNREACH error. blackhole - these destinations are unreachable. Packets are dis‐ carded silently. The local senders get an EINVAL error. prohibit - these destinations are unreachable. Packets are discarded and the ICMP message communication administratively prohibited is generated. The local senders get an EACCES error. In the inet6 address family, this was correct, except the local senders got ENETUNREACH error instead of EHOSTUNREACH in case of unreachable route. In the inet address family, all three route types generated ICMP message net unreachable, and the local senders got ENETUNREACH error. In both address families all three route types now behave consistently with documentation. Signed-off-by: Nikola Forró Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a37d0432bebd..727d6e9a9685 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -236,8 +236,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); - if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF)) - err = 0; + if (tb) + err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); + + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); @@ -258,7 +261,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; - int err; + int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) @@ -268,15 +271,20 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, res->tclassid = 0; - for (err = 0; !err; err = -ENETUNREACH) { - tb = rcu_dereference_rtnl(net->ipv4.fib_main); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; + tb = rcu_dereference_rtnl(net->ipv4.fib_main); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); + + if (!err) + goto out; + + tb = rcu_dereference_rtnl(net->ipv4.fib_default); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); - tb = rcu_dereference_rtnl(net->ipv4.fib_default); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; - } +out: + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); -- cgit v1.2.3 From 83cf9a2521b0934a5f9d04082c9bb4f554fddcd4 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 18 Sep 2015 11:47:41 +0200 Subject: ip6tunnel: make rx/tx bytes counters consistent Like the previous patch, which fixes ipv4 tunnels, here is the ipv6 part. Before the patch, the external ipv6 header + gre header were included on tx. After the patch: $ ping -c1 192.168.6.121 ; ip -s l ls dev ip6gre1 PING 192.168.6.121 (192.168.6.121) 56(84) bytes of data. 64 bytes from 192.168.6.121: icmp_req=1 ttl=64 time=1.92 ms --- 192.168.6.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 1.923/1.923/1.923/0.000 ms 7: ip6gre1@NONE: mtu 1440 qdisc noqueue state UNKNOWN mode DEFAULT group default link/gre6 20:01:06:60:30:08:c1:c3:00:00:00:00:00:00:01:23 peer 20:01:06:60:30:08:c1:c3:00:00:00:00:00:00:01:21 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 $ ping -c1 192.168.1.121 ; ip -s l ls dev ip6tnl1 PING 192.168.1.121 (192.168.1.121) 56(84) bytes of data. 64 bytes from 192.168.1.121: icmp_req=1 ttl=64 time=2.28 ms --- 192.168.1.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 2.288/2.288/2.288/0.000 ms 8: ip6tnl1@NONE: mtu 1452 qdisc noqueue state UNKNOWN mode DEFAULT group default link/tunnel6 2001:660:3008:c1c3::123 peer 2001:660:3008:c1c3::121 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- include/net/ip6_tunnel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 65c2a9397b3c..fa915fa0f703 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -86,7 +86,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device_stats *stats = &dev->stats; int pkt_len, err; - pkt_len = skb->len; + pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out_sk(sk, skb); if (net_xmit_eval(err) == 0) { -- cgit v1.2.3 From ed2e923945892a8372ab70d2f61d364b0b6d9054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:08:34 -0700 Subject: tcp/dccp: fix timewait races in timer handling When creating a timewait socket, we need to arm the timer before allowing other cpus to find it. The signal allowing cpus to find the socket is setting tw_refcnt to non zero value. As we set tw_refcnt in __inet_twsk_hashdance(), we therefore need to call inet_twsk_schedule() first. This also means we need to remove tw_refcnt changes from inet_twsk_schedule() and let the caller handle it. Note that because we use mod_timer_pinned(), we have the guarantee the timer wont expire before we set tw_refcnt as we run in BH context. To make things more readable I introduced inet_twsk_reschedule() helper. When rearming the timer, we can use mod_timer_pending() to make sure we do not rearm a canceled timer. Note: This bug can possibly trigger if packets of a flow can hit multiple cpus. This does not normally happen, unless flow steering is broken somehow. This explains this bug was spotted ~5 months after its introduction. A similar fix is needed for SYN_RECV sockets in reqsk_queue_hash_req(), but will be provided in a separate patch for proper tracking. Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Reported-by: Ying Cai Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 879d6e5a973b..186f3a1e1b1f 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,7 +110,19 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, + bool rearm); + +static void inline inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + +static void inline inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, true); +} + void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, -- cgit v1.2.3 From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:49 -0700 Subject: userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key" This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts fs/userfaultfd.c to use the old version of that function. 
It didn't look robust to call __wake_up_common with "nr == 1" when we absolutely require wakeall semantics, but we've full control of what we insert in the two waitqueue heads of the blocked userfaults. No exclusive waitqueue risks to be inserted into those two waitqueue heads so we can as well stick to "nr == 1" of the old code and we can rely purely on the fact no waitqueue inserted in one of the two waitqueue heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set. Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index d3d077228d4c..1e1bf9f963a9 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -147,8 +147,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) typedef int wait_bit_action_f(struct wait_bit_key *); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); @@ -180,7 +179,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) #define wake_up_locked_poll(x, m) \ - __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m)) + __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) #define wake_up_interruptible_sync_poll(x, m) \ -- cgit v1.2.3 From 09f7298100ea9767324298ab0c7979f6d7463183 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 22 Sep 2015 14:59:09 -0700 Subject: userfaultfd: register uapi generic syscall (aarch64) Add the userfaultfd syscalls to uapi asm-generic, it was tested with postcopy live migration on aarch64 with both 4k and 64k pagesize kernels. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Andrea Arcangeli Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/unistd.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 8da542a2874d..ee124009e12a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -709,17 +709,19 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create) __SYSCALL(__NR_bpf, sys_bpf) #define __NR_execveat 281 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) -#define __NR_membarrier 282 +#define __NR_userfaultfd 282 +__SYSCALL(__NR_userfaultfd, sys_userfaultfd) +#define __NR_membarrier 283 __SYSCALL(__NR_membarrier, sys_membarrier) #undef __NR_syscalls -#define __NR_syscalls 283 +#define __NR_syscalls 284 /* * All syscalls below here should go away really, * these are provided for both review and as a porting * help for the C library version. -* + * * Last chance: are any of these important enough to * enable by default? 
*/ -- cgit v1.2.3 From 2d8bff12699abc3a9bf886bb0b79f44d94d81496 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 23 Sep 2015 14:57:58 -0400 Subject: netpoll: Close race condition between poll_one_napi and napi_disable Drivers might call napi_disable while not holding the napi instance poll_lock. In those instances, its possible for a race condition to exist between poll_one_napi and napi_disable. That is to say, poll_one_napi only tests the NAPI_STATE_SCHED bit to see if there is work to do during a poll, and as such the following may happen: CPU0 CPU1 ndo_tx_timeout napi_poll_dev napi_disable poll_one_napi test_and_set_bit (ret 0) test_bit (ret 1) reset adapter napi_poll_routine If the adapter gets a tx timeout without a napi instance scheduled, its possible for the adapter to think it has exclusive access to the hardware (as the napi instance is now scheduled via the napi_disable call), while the netpoll code thinks there is simply work to do. The result is parallel hardware access leading to corrupt data structures in the driver, and a crash. Additionaly, there is another, more critical race between netpoll and napi_disable. The disabled napi state is actually identical to the scheduled state for a given napi instance. The implication being that, if a napi instance is disabled, a netconsole instance would see the napi state of the device as having been scheduled, and poll it, likely while the driver was dong something requiring exclusive access. In the case above, its fairly clear that not having the rings in a state ready to be polled will cause any number of crashes. The fix should be pretty easy. netpoll uses its own bit to indicate that that the napi instance is in a state of being serviced by netpoll (NAPI_STATE_NPSVC). We can just gate disabling on that bit as well as the sched bit. That should prevent netpoll from conducting a napi poll if we convert its set bit to a test_and_set_bit operation to provide mutual exclusion Change notes: V2) Remove a trailing whtiespace Resubmit with proper subject prefix V3) Clean up spacing nits Signed-off-by: Neil Horman CC: "David S. Miller" CC: jmaxwell@redhat.com Tested-by: jmaxwell@redhat.com Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 88a00694eda5..2d15e3831440 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -507,6 +507,7 @@ static inline void napi_enable(struct napi_struct *n) BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); } #ifdef CONFIG_SMP -- cgit v1.2.3 From 9badce000e2ce68ba74838a3cd356dde58221c2f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Sep 2015 17:07:29 -0400 Subject: cgroup, writeback: don't enable cgroup writeback on traditional hierarchies inode_cgwb_enabled() gates cgroup writeback support. If it returns true, each inode is attached to the corresponding memory domain which gets mapped to io domain. It currently only tests whether the filesystem and bdi support cgroup writeback; however, cgroup writeback support doesn't work on traditional hierarchies and thus it should also test whether memcg and iocg are on the default hierarchy. 
This caused traditional hierarchy setups to hit the cgroup writeback path inadvertently and ended up creating separate writeback domains for each memcg and mapping them all to the root iocg uncovering a couple issues in the cgroup writeback path. cgroup writeback was never meant to be enabled on traditional hierarchies. Make inode_cgwb_enabled() test whether both memcg and iocg are on the default hierarchy. Signed-off-by: Tejun Heo Reported-by: Artem Bityutskiy Reported-by: Dexuan Cui Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com Link: http://lkml.kernel.org/g/f30d4a6aa8a546ff88f73021d026a453@SIXPR30MB031.064d.mgd.msft.net --- include/linux/backing-dev.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5a5d79ee256f..d5eb4ad1c534 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -252,13 +253,19 @@ int inode_congested(struct inode *inode, int cong_bits); * @inode: inode of interest * * cgroup writeback requires support from both the bdi and filesystem. - * Test whether @inode has both. + * Also, both memcg and iocg have to be on the default hierarchy. Test + * whether all conditions are met. + * + * Note that the test result may change dynamically on the same inode + * depending on how memcg and iocg are configured. */ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); - return bdi_cap_account_dirty(bdi) && + return cgroup_on_dfl(mem_cgroup_root_css->cgroup) && + cgroup_on_dfl(blkcg_root_css->cgroup) && + bdi_cap_account_dirty(bdi) && (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } -- cgit v1.2.3 From 6ae459bdaaeebc632b16e54dcbabb490c6931d61 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 22 Sep 2015 12:57:53 -0700 Subject: skbuff: Fix skb checksum flag on skb pull VXLAN device can receive skb with checksum partial. But the checksum offset could be in outer header which is pulled on receive. This results in negative checksum offset for the skb. Such skb can cause the assert failure in skb_checksum_help(). Following patch fixes the bug by setting checksum-none while pulling outer header. Following is the kernel panic msg from old kernel hitting the bug. ------------[ cut here ]------------ kernel BUG at net/core/dev.c:1906! RIP: 0010:[] skb_checksum_help+0x144/0x150 Call Trace: [] queue_userspace_packet+0x408/0x470 [openvswitch] [] ovs_dp_upcall+0x5d/0x60 [openvswitch] [] ovs_dp_process_packet_with_key+0xe6/0x100 [openvswitch] [] ovs_dp_process_received_packet+0x4b/0x80 [openvswitch] [] ovs_vport_receive+0x2a/0x30 [openvswitch] [] vxlan_rcv+0x53/0x60 [openvswitch] [] vxlan_udp_encap_recv+0x8b/0xf0 [openvswitch] [] udp_queue_rcv_skb+0x2dc/0x3b0 [] __udp4_lib_rcv+0x1cf/0x6c0 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xdd/0x280 [] ip_local_deliver+0x88/0x90 [] ip_rcv_finish+0x10d/0x370 [] ip_rcv+0x235/0x300 [] __netif_receive_skb+0x55d/0x620 [] netif_receive_skb+0x80/0x90 [] virtnet_poll+0x555/0x6f0 [] net_rx_action+0x134/0x290 [] __do_softirq+0xa8/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0x65/0xa0 [] irq_exit+0x8e/0xb0 [] do_IRQ+0x63/0xe0 [] common_interrupt+0x6e/0x6e Reported-by: Anupam Chanda Signed-off-by: Pravin B Shelar Acked-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9987af080fa0..2b0a30a6e31c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2707,6 +2707,9 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); + else if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) <= len) + skb->ip_summed = CHECKSUM_NONE; } unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); -- cgit v1.2.3 From 63d008a4e9ee86614ca5671b7f3ba447df007190 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:11 +0200 Subject: ipv4: send arp replies to the correct tunnel When using ip lwtunnels, the additional data for xmit (basically, the actual tunnel to use) are carried in ip_tunnel_info either in dst->lwtstate or in metadata dst. When replying to ARP requests, we need to send the reply to the same tunnel the request came from. This means we need to construct proper metadata dst for ARP replies. We could perform another route lookup to get a dst entry with the correct lwtstate. However, this won't always ensure that the outgoing tunnel is the same as the incoming one, and it won't work anyway for IPv4 duplicate address detection. The only thing to do is to "reverse" the ip_tunnel_info. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9a6a3ba888e8..f6dafec9102c 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -276,6 +276,8 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); -- cgit v1.2.3 From b194f30c61efb0767a98f47a64530baa8b731670 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:12 +0200 Subject: lwtunnel: remove source and destination UDP port config option The UDP tunnel config is asymmetric wrt. to the ports used. The source and destination ports from one direction of the tunnel are not related to the ports of the other direction. We need to be able to respond to ARP requests using the correct ports without involving routing. As the consequence, UDP ports need to be fixed property of the tunnel interface and cannot be set per route. Remove the ability to set ports per route. This is still okay to do, as no kernel has been released with these attributes yet. Note that the ability to specify source and destination ports is preserved for other users of the lwtunnel API which don't use routes for tunnel key specification (like openvswitch). If in the future we rework ARP handling to allow port specification, the attributes can be added back. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/uapi/linux/lwtunnel.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 34141a5dfe74..f8b01887a495 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -21,8 +21,6 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_SRC, LWTUNNEL_IP_TTL, LWTUNNEL_IP_TOS, - LWTUNNEL_IP_SPORT, - LWTUNNEL_IP_DPORT, LWTUNNEL_IP_FLAGS, __LWTUNNEL_IP_MAX, }; @@ -36,8 +34,6 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_SRC, LWTUNNEL_IP6_HOPLIMIT, LWTUNNEL_IP6_TC, - LWTUNNEL_IP6_SPORT, - LWTUNNEL_IP6_DPORT, LWTUNNEL_IP6_FLAGS, __LWTUNNEL_IP6_MAX, }; -- cgit v1.2.3 From 3e3aaf649416988ca8be4ad2c52dc24d8be7b46e Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:02 +0100 Subject: phy: fix mdiobus module safety Re-implement the mdiobus module refcounting to ensure that we actually ensure that the mdiobus module code does not go away while we might call into it. The old scheme using bus->dev.driver was buggy, because bus->dev is a class device which never has a struct device_driver associated with it, and hence the associated code trying to obtain a refcount did nothing useful. Instead, take the approach that other subsystems do: pass the module when calling mdiobus_register(), and record that in the mii_bus struct. When we need to increment the module use count in the phy code, use this stored pointer. When the phy is deteched, drop the module refcount, remembering that the phy device might go away at that point. This doesn't stop the mii_bus going away while there are in-use phys - it merely stops the underlying code vanishing. Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/phy.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 962387a192f1..11bce44f6d65 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,7 @@ struct sk_buff; * PHYs should register using this structure */ struct mii_bus { + struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; @@ -198,7 +200,8 @@ static inline struct mii_bus *mdiobus_alloc(void) return mdiobus_alloc_size(0); } -int mdiobus_register(struct mii_bus *bus); +int __mdiobus_register(struct mii_bus *bus, struct module *owner); +#define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); -- cgit v1.2.3 From 38737e490d4ea91660d3cec83ef88c4e6d360ae4 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:28 +0100 Subject: phy: add phy_device_remove() Add a phy_device_remove() function to complement phy_device_register(), which undoes the effects of phy_device_register() by removing the phy device from visibility, but not freeing it. This allows these details to be moved out of the mdio bus code into the phy code where this action belongs. Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 11bce44f6d65..4a4e3a092337 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -745,6 +745,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, struct phy_c45_device_ids *c45_ids); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); +void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); -- cgit v1.2.3 From eeeb9522231118138be418ff527dc8c9050f4707 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Tue, 15 Sep 2015 17:27:35 -0700 Subject: target: Propigate backend read-only to core_tpg_add_lun This patch adds a DF_READ_ONLY flag that is used by IBLOCK to signal when a backend has been set to read-only mode, in order to propigate read-only status up to core_tpg_add_lun() for all future LUN fabric exports. With this is place, existing emulation for reporting read-only in spc_emulate_modesense() and normal transport_lookup_cmd_lun() TCM_WRITE_PROTECTED status checking just works as expected. Reported-by: Joeue Deng Reported-by: Andy Grover Signed-off-by: Nicholas Bellinger --- include/target/target_core_base.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index ac9bf1c0e42d..5f48754dc36a 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -730,6 +730,7 @@ struct se_device { #define DF_EMULATED_VPD_UNIT_SERIAL 0x00000004 #define DF_USING_UDEV_PATH 0x00000008 #define DF_USING_ALIAS 0x00000010 +#define DF_READ_ONLY 0x00000020 /* Physical device queue depth */ u32 queue_depth; /* Used for SPC-2 reservations enforce of ISIDs */ -- cgit v1.2.3 From c6790aa9f4fdc26b1246ba36da2fd749663beb65 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 Sep 2015 10:34:23 +0300 Subject: IB/mlx5: Remove support for IB_DEVICE_LOCAL_DMA_LKEY Commit 96249d70dd70 ("IB/core: Guarantee that a local_dma_lkey is available") allows ULPs that make use of the local dma key to keep working as before by allocating a DMA MR with local permissions and converted these consumers to use the MR associated with the PD rather then device->local_dma_lkey. ConnectIB has some known issues with memory registration using the local_dma_lkey (SEND, RDMA, RECV seems to work ok). Thus don't expose support for it (remove device->local_dma_lkey setting), and take advantage of the above commit such that no regression is introduced to working systems. The local_dma_lkey support will be restored in CX4 depending on FW capability query. 
Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 11 ----------- include/linux/mlx5/driver.h | 1 - 2 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8eb3b19af2a4..250b1ff8b48d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -402,17 +402,6 @@ struct mlx5_cmd_teardown_hca_mbox_out { u8 rsvd[8]; }; -struct mlx5_cmd_query_special_contexts_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_query_special_contexts_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 dump_fill_mkey; - __be32 resd_lkey; -}; - struct mlx5_cmd_layout { u8 type; u8 rsvd0[3]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 27b53f9a24ad..8b6d6f2154a4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -845,7 +845,6 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol); int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); -int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey); struct mlx5_profile { u64 mask; -- cgit v1.2.3 From 5ebc76035303016ec41bb752bec156ea9fde7c34 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 17 Sep 2015 14:02:45 +0800 Subject: ACPI, PCI, irq: Do not share PCI IRQ with ISA IRQ Avoid IRQs occupied by ISA IRQs when allocating IRQs for PCI link devices, otherwise it may cause interrupt storm due to incompatible pin attributes. This issue was triggered on a KVM virtual machine, which 1) uses IRQ9 for SCI in high level mode. 2) defines an PCI interrupt link device (LNKS) with IRQ9 as the only possible irq. 3) has an PCI device referring to link device LNKS. So it causes interrupt storm when enabling the PCI device because PCI IRQ works in low level mode. Signed-off-by: Jiang Liu Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7235c4851460..43856d19cf4d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -217,6 +217,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); +bool acpi_isa_irq_available(int irq); void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); -- cgit v1.2.3 From 4593fdbe7a2f44d5e64c627c715dd0bcec9bdf14 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 27 Sep 2015 02:09:20 +0900 Subject: blk-mq: fix sysfs registration/unregistration race There is a race between cpu hotplug handling and adding/deleting gendisk for blk-mq, where both are trying to register and unregister the same sysfs entries. null_add_dev --> blk_mq_init_queue --> blk_mq_init_allocated_queue --> add to 'all_q_list' (*) --> add_disk --> blk_register_queue --> blk_mq_register_disk (++) null_del_dev --> del_gendisk --> blk_unregister_queue --> blk_mq_unregister_disk (--) --> blk_cleanup_queue --> blk_mq_free_queue --> del from 'all_q_list' (*) blk_mq_queue_reinit --> blk_mq_sysfs_unregister (-) --> blk_mq_sysfs_register (+) While the request queue is added to 'all_q_list' (*), blk_mq_queue_reinit() can be called for the queue anytime by CPU hotplug callback. 
But blk_mq_sysfs_unregister (-) and blk_mq_sysfs_register (+) in blk_mq_queue_reinit must not be called before blk_mq_register_disk (++) and after blk_mq_unregister_disk (--) is finished. Because '/sys/block/*/mq/' is not exists. There has already been BLK_MQ_F_SYSFS_UP flag in hctx->flags which can be used to track these sysfs stuff, but it is only fixing this issue partially. In order to fix it completely, we just need per-queue flag instead of per-hctx flag with appropriate locking. So this introduces q->mq_sysfs_init_done which is properly protected with all_q_mutex. Also, we need to ensure that blk_mq_map_swqueue() is called with all_q_mutex is held. Since hctx->nr_ctx is reset temporarily and updated in blk_mq_map_swqueue(), so we should avoid blk_mq_register_hctx() seeing the temporary hctx->nr_ctx value in CPU hotplug handling or adding/deleting gendisk . Signed-off-by: Akinobu Mita Reviewed-by: Ming Lei Cc: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - include/linux/blkdev.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 37d1602c4f7a..b80ba4572a31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -145,7 +145,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99da9ebc7377..19c2e947d4d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -456,6 +456,8 @@ struct request_queue { struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set *bio_split; + + bool mq_sysfs_init_done; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From 4613012db1d911f80897f9446a49de817b2c4c47 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Sat, 26 Sep 2015 18:50:42 -0400 Subject: af_unix: Convert the unix_sk macro to an inline function for type safety As suggested by Eric Dumazet this change replaces the #define with a static inline function to enjoy complaints by the compiler when misusing the API. Signed-off-by: Aaron Conole Signed-off-by: David S. Miller --- include/net/af_unix.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 4a167b30a12f..cb1b9bbda332 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -63,7 +63,11 @@ struct unix_sock { #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; }; -#define unix_sk(__sk) ((struct unix_sock *)__sk) + +static inline struct unix_sock *unix_sk(struct sock *sk) +{ + return (struct unix_sock *)sk; +} #define peer_wait peer_wq.wait -- cgit v1.2.3 From 31b33dfb0a144469dd805514c9e63f4993729a48 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 28 Sep 2015 17:24:25 -0700 Subject: skbuff: Fix skb checksum partial check. Earlier patch 6ae459bda tried to detect void ckecksum partial skb by comparing pull length to checksum offset. But it does not work for all cases since checksum-offset depends on updates to skb->data. Following patch fixes it by validating checksum start offset after skb-data pointer is updated. Negative value of checksum offset start means there is no need to checksum. 
Fixes: 6ae459bda ("skbuff: Fix skb checksum flag on skb pull")
Reported-by: Andrew Vagin
Signed-off-by: Pravin B Shelar
Signed-off-by: David S. Miller
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2b0a30a6e31c..4398411236f1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2708,7 +2708,7 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb,
 	if (skb->ip_summed == CHECKSUM_COMPLETE)
 		skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0));
 	else if (skb->ip_summed == CHECKSUM_PARTIAL &&
-		 skb_checksum_start_offset(skb) <= len)
+		 skb_checksum_start_offset(skb) < 0)
 		skb->ip_summed = CHECKSUM_NONE;
 }
 
--
cgit v1.2.3


From 4ad640e99e5e5514d623210bc937e665ffd8f43f Mon Sep 17 00:00:00 2001
From: Egbert Eich
Date: Wed, 23 Sep 2015 16:13:00 +0200
Subject: drm: Add a non-locking version of drm_kms_helper_poll_enable(), v2

drm_kms_helper_poll_enable() was converted to lock the mode_config
mutex in commit 8c4ccc4ab6f64e859d4ff8d7c02c2ed2e956e07f
("drm/probe-helper: Grab mode_config.mutex in poll_init/enable").

This disregarded the cases where this function is called from a context
where this mutex is already locked.

Add a non-locking version as well.

Changes since v1:
- use function name suffix '_locked' for the function that is to be
  called from a locked context.

Signed-off-by: Egbert Eich
Reviewed-by: Daniel Vetter
Signed-off-by: Jani Nikula
---
 include/drm/drm_crtc_helper.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h
index 2a747a91fded..3febb4b9fce9 100644
--- a/include/drm/drm_crtc_helper.h
+++ b/include/drm/drm_crtc_helper.h
@@ -240,5 +240,6 @@ extern void drm_kms_helper_hotplug_event(struct drm_device *dev);
 
 extern void drm_kms_helper_poll_disable(struct drm_device *dev);
 extern void drm_kms_helper_poll_enable(struct drm_device *dev);
+extern void drm_kms_helper_poll_enable_locked(struct drm_device *dev);
 
 #endif
--
cgit v1.2.3


From f4829a9b7a61e159367350008a608b062c4f6840 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sun, 27 Sep 2015 21:01:50 +0200
Subject: blk-mq: fix racy updates of rq->errors

blk_mq_complete_request may be a no-op if the request has already been
completed by other means (e.g. a timeout or cancellation), but
currently drivers have to set rq->errors before calling
blk_mq_complete_request, which might leave us with the wrong error
value.

Add an error parameter to blk_mq_complete_request so that we can defer
setting rq->errors until we know we won the race to complete the
request.
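
For illustration only (a hypothetical driver snippet, not part of this
patch), a completion path converted to the new prototype changes
roughly like this:

    /* old interface: the error had to be stored up front, even if this
     * completion later turned out to lose the race and be a no-op */
    rq->errors = ret;
    blk_mq_complete_request(rq);

    /* new interface: the error is passed in and only applied once the
     * block layer knows this completion actually won the race */
    blk_mq_complete_request(rq, ret);
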
Signed-off-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b80ba4572a31..c1b5c867ff07 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -214,7 +214,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
 void blk_mq_cancel_requeue_work(struct request_queue *q);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq);
+void blk_mq_complete_request(struct request *rq, int error);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
--
cgit v1.2.3


From 0bf6cd5b9531bcc29c0a5e504b6ce2984c6fd8d8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sun, 27 Sep 2015 21:01:51 +0200
Subject: blk-mq: factor out a helper to iterate all tags for a request_queue

And replace the blk_mq_tag_busy_iter with it - the driver use was
replaced with a new helper a while ago, and internal to the block layer
we only need the new version.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c1b5c867ff07..5e7d43ab61c0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -223,8 +223,6 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
-		void *priv);
 void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
--
cgit v1.2.3


From 9ff42d10c3b3e26d9555878f31b9a2e5c24efa57 Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Thu, 1 Oct 2015 15:36:51 -0700
Subject: userfaultfd: remove kernel header include from uapi header

As include/uapi/linux/userfaultfd.h is a user visible header file, it
should not include kernel-exclusive header files.

So trying to build the userfaultfd test program from the selftests
directory fails, since it contains a reference to linux/compiler.h.
As it turns out, that header is not really needed there, so we can
simply remove it to fix that issue.

Signed-off-by: Andre Przywara
Cc: Andrea Arcangeli
Cc: Shuah Khan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/uapi/linux/userfaultfd.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index df0e09bb7dd5..9057d7af3ae1 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -11,8 +11,6 @@
 
 #include 
 
-#include 
-
 #define UFFD_API ((__u64)0xAA)
 /*
  * After implementing the respective features it will become:
--
cgit v1.2.3


From 0610c25daa3e76e38ad5a8fae683a89ff9f71798 Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Thu, 1 Oct 2015 15:37:02 -0700
Subject: memcg: fix dirty page migration

The problem starts with a file-backed dirty page which is charged to a
memcg.  Then page migration is used to move oldpage to newpage.
Migration:
 - copies the oldpage's data to newpage
 - clears oldpage.PG_dirty
 - sets newpage.PG_dirty
 - uncharges oldpage from memcg
 - charges newpage to memcg

Clearing oldpage.PG_dirty decrements the charged memcg's dirty page
count.

However, because newpage is not yet charged, setting newpage.PG_dirty
does not increment the memcg's dirty page count.

After migration completes, newpage.PG_dirty is eventually cleared,
often in account_page_cleaned().  At this time newpage is charged to a
memcg, so the memcg's dirty page count is decremented, which causes
underflow because the count was not previously incremented by
migration.  This underflow causes balance_dirty_pages() to see a very
large unsigned number of dirty memcg pages, which leads to aggressive
throttling of buffered writes by processes in a non-root memcg.

This issue:
 - can harm performance of non-root memcg buffered writes.
 - can report too small (even negative) values in
   memory.stat[(total_)dirty] counters of all memcg, including the root.

To avoid polluting migrate.c with #ifdef CONFIG_MEMCG checks, introduce
page_memcg() and set_page_memcg() helpers.

Test:
    0) setup and enter limited memcg
    mkdir /sys/fs/cgroup/test
    echo 1G > /sys/fs/cgroup/test/memory.limit_in_bytes
    echo $$ > /sys/fs/cgroup/test/cgroup.procs

    1) buffered writes baseline
    dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k
    sync
    grep ^dirty /sys/fs/cgroup/test/memory.stat

    2) buffered writes with compaction antagonist to induce migration
    yes 1 > /proc/sys/vm/compact_memory &
    rm -rf /data/tmp/foo
    dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k
    kill %
    sync
    grep ^dirty /sys/fs/cgroup/test/memory.stat

    3) buffered writes without antagonist, should match baseline
    rm -rf /data/tmp/foo
    dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k
    sync
    grep ^dirty /sys/fs/cgroup/test/memory.stat

       (speed, dirty residue)
       unpatched                          patched
    1) 841 MB/s 0 dirty pages             886 MB/s 0 dirty pages
    2) 611 MB/s -33427456 dirty pages     793 MB/s 0 dirty pages
    3) 114 MB/s -33427456 dirty pages     891 MB/s 0 dirty pages

Notice that unpatched baseline performance (1) fell after migration (3):
841 -> 114 MB/s.  In the patched kernel, post migration performance
matches baseline.
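
For illustration, a sketch of how migration code can use the new
helpers so the dirty accounting stays balanced (simplified; the real
call sites live in mm/migrate.c, not in the header diff below):

    /* carry the memcg association from oldpage to newpage before the
     * dirty bit is transferred, so the later clear of newpage.PG_dirty
     * decrements a count that was actually incremented */
    set_page_memcg(newpage, page_memcg(oldpage));
    set_page_memcg(oldpage, NULL);
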
Fixes: c4843a7593a9 ("memcg: add per cgroup dirty page accounting")
Signed-off-by: Greg Thelen
Reported-by: Dave Hansen
Acked-by: Michal Hocko
Acked-by: Johannes Weiner
Cc:  [4.2+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 91c08f6f0dc9..80001de019ba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -905,6 +905,27 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 #endif
 }
 
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *page_memcg(struct page *page)
+{
+	return page->mem_cgroup;
+}
+
+static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
+{
+	page->mem_cgroup = memcg;
+}
+#else
+static inline struct mem_cgroup *page_memcg(struct page *page)
+{
+	return NULL;
+}
+
+static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
+{
+}
+#endif
+
 /*
  * Some inline functions in vmstat.h depend on page_zone()
  */
--
cgit v1.2.3


From ef510194cefe0cd369ef73419cd65b0a5bb4fb5b Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Thu, 1 Oct 2015 15:37:13 -0700
Subject: memcg: remove pcp_counter_lock

Commit 733a572e66d2 ("memcg: make mem_cgroup_read_{stat|event}()
iterate possible cpus instead of online") removed the last use of the
per memcg pcp_counter_lock, but forgot to remove the variable.

Kill the vestigial variable.

Signed-off-by: Greg Thelen
Acked-by: Michal Hocko
Acked-by: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ad800e62cb7a..6452ff4c463f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -242,7 +242,6 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
-	spinlock_t pcp_counter_lock;
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct cg_proto tcp_mem;
--
cgit v1.2.3


From d9515c5ec1a20c77d83471e634ad9bb12deb0eac Mon Sep 17 00:00:00 2001
From: Dave Airlie
Date: Wed, 16 Sep 2015 17:55:23 +1000
Subject: drm/dp/mst: split connector registration into two parts (v2)

In order to cache the EDID properly for tiled displays, we need to
retrieve it before we register the connector with userspace; otherwise
userspace can call get resources and try to get the EDID before we've
even cached it.

This fixes some problems when hotplugging MST monitors with X/mutter
running, as mutter seems to get 0 modes for one of the monitors in the
tile.

v2: fix warning in radeon
    handle tile setting in the cached path rather than the get-EDID path.
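
A driver-side sketch of the two-part flow (the example_mst_* names are
hypothetical; the exact hooks are in the header change below):

    static void example_mst_register_connector(struct drm_connector *connector)
    {
        /* step two: the EDID/tile info was cached when the connector was
         * created, so it is now safe to expose it to userspace */
        drm_connector_register(connector);
    }

    static struct drm_dp_mst_topology_cbs example_mst_cbs = {
        /* creates the connector and caches the EDID, without registering */
        .add_connector      = example_mst_add_connector,
        .register_connector = example_mst_register_connector,
        .destroy_connector  = example_mst_destroy_connector,
        .hotplug            = example_mst_hotplug,
    };
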
Reviewed-by: Daniel Vetter
Cc: stable@vger.kernel.org
Signed-off-by: Dave Airlie
---
 include/drm/drm_dp_mst_helper.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/drm_dp_mst_helper.h b/include/drm/drm_dp_mst_helper.h
index 86d0b25ed054..0f408b002d98 100644
--- a/include/drm/drm_dp_mst_helper.h
+++ b/include/drm/drm_dp_mst_helper.h
@@ -374,6 +374,7 @@ struct drm_dp_mst_topology_mgr;
 struct drm_dp_mst_topology_cbs {
 	/* create a connector for a port */
 	struct drm_connector *(*add_connector)(struct drm_dp_mst_topology_mgr *mgr, struct drm_dp_mst_port *port, const char *path);
+	void (*register_connector)(struct drm_connector *connector);
 	void (*destroy_connector)(struct drm_dp_mst_topology_mgr *mgr,
 				  struct drm_connector *connector);
 	void (*hotplug)(struct drm_dp_mst_topology_mgr *mgr);
--
cgit v1.2.3


From ccf03d6995fa4b784f5b987726ba98f4859bf326 Mon Sep 17 00:00:00 2001
From: Dave Airlie
Date: Thu, 1 Oct 2015 16:28:25 +1000
Subject: drm/dp/mst: add some defines for logical/physical ports

This just removes the magic number.

Acked-by: Daniel Vetter
Signed-off-by: Dave Airlie
---
 include/drm/drm_dp_helper.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 499e9f625aef..0212d139a480 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -568,6 +568,10 @@
 #define MODE_I2C_READ	4
 #define MODE_I2C_STOP	8
 
+/* DP 1.2 MST PORTs - Section 2.5.1 v1.2a spec */
+#define DP_MST_PHYSICAL_PORT_0 0
+#define DP_MST_LOGICAL_PORT_0 8
+
 #define DP_LINK_STATUS_SIZE	   6
 bool drm_dp_channel_eq_ok(const u8 link_status[DP_LINK_STATUS_SIZE],
 			  int lane_count);
--
cgit v1.2.3
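
A hedged example of the kind of check these defines are meant to
replace (the helper name is made up; the MST code previously open-coded
the 8):

    /* DP MST port numbers below 8 are physical ports; 8 and up are
     * logical ports */
    static bool example_port_is_logical(u8 port_num)
    {
        return port_num >= DP_MST_LOGICAL_PORT_0;
    }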