From d1c85c2ebe7ffe1f1b27846bd1ba0944c513d822 Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou <zhouzhouyi@gmail.com>
Date: Fri, 22 Aug 2014 10:40:15 +0800
Subject: netfilter: HAVE_JUMP_LABEL instead of CONFIG_JUMP_LABEL

Use HAVE_JUMP_LABEL as elsewhere in the kernel to ensure
that the toolchain has the required support in addition to
CONFIG_JUMP_LABEL being set.

Signed-off-by: Zhouyi Zhou <yizhouzhou@ict.ac.cn>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2077489f9887..2517ece98820 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -9,6 +9,7 @@
 #include <linux/in6.h>
 #include <linux/wait.h>
 #include <linux/list.h>
+#include <linux/static_key.h>
 #include <uapi/linux/netfilter.h>
 #ifdef CONFIG_NETFILTER
 static inline int NF_DROP_GETERR(int verdict)
@@ -99,9 +100,9 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
 extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
-#if defined(CONFIG_JUMP_LABEL)
-#include <linux/static_key.h>
+#ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+
 static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
 {
 	if (__builtin_constant_p(pf) &&
-- 
cgit v1.2.3


From b95089d00c04712a9d4655d5c638930ac24b7bd3 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 27 Aug 2014 16:47:48 +0300
Subject: net/mlx4: Move the tunnel steering helper function to mlx4_core

Move the function which we use to set VXLAN DMFS (flow-steering) rules
from mlx4_en to mlx4_core. This refactoring will allow the mlx4_ib driver
to call the helper for the use case of user-space RAW Ethernet QPs, such
that they can serve VXLAN traffic too.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 071f6b234604..511c6e0d21a9 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1196,6 +1196,9 @@ int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
 				  enum mlx4_net_trans_rule_id id);
 int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
 
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+			  int port, int qpn, u16 prio, u64 *reg_id);
+
 void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
 			  int i, int val);
 
-- 
cgit v1.2.3


From e793c0f70e9bdf4a2e71c151a1a3cf85c4db92ad Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Thu, 4 Sep 2014 23:44:36 +0900
Subject: net: treewide: Fix typo found in DocBook/networking.xml

This patch fix spelling typo found in DocBook/networking.xml.
It is because the neworking.xml is generated from comments
in the source, I have to fix typo in comments within the source.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 38377392d082..c8e388e5fccc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3176,7 +3176,7 @@ static inline int __dev_uc_sync(struct net_device *dev,
 }
 
 /**
- *  __dev_uc_unsync - Remove synchonized addresses from device
+ *  __dev_uc_unsync - Remove synchronized addresses from device
  *  @dev:  device to sync
  *  @unsync: function to call if address should be removed
  *
@@ -3220,7 +3220,7 @@ static inline int __dev_mc_sync(struct net_device *dev,
 }
 
 /**
- *  __dev_mc_unsync - Remove synchonized addresses from device
+ *  __dev_mc_unsync - Remove synchronized addresses from device
  *  @dev:  device to sync
  *  @unsync: function to call if address should be removed
  *
-- 
cgit v1.2.3


From d78c9300c51d6ceed9f6d078d4e9366f259de28c Mon Sep 17 00:00:00 2001
From: Andrew Hunter <ahh@google.com>
Date: Thu, 4 Sep 2014 14:17:16 -0700
Subject: jiffies: Fix timeval conversion to jiffies

timeval_to_jiffies tried to round a timeval up to an integral number
of jiffies, but the logic for doing so was incorrect: intervals
corresponding to exactly N jiffies would become N+1. This manifested
itself particularly repeatedly stopping/starting an itimer:

setitimer(ITIMER_PROF, &val, NULL);
setitimer(ITIMER_PROF, NULL, &val);

would add a full tick to val, _even if it was exactly representable in
terms of jiffies_ (say, the result of a previous rounding.)  Doing
this repeatedly would cause unbounded growth in val.  So fix the math.

Here's what was wrong with the conversion: we essentially computed
(eliding seconds)

jiffies = usec  * (NSEC_PER_USEC/TICK_NSEC)

by using scaling arithmetic, which took the best approximation of
NSEC_PER_USEC/TICK_NSEC with denominator of 2^USEC_JIFFIE_SC =
x/(2^USEC_JIFFIE_SC), and computed:

jiffies = (usec * x) >> USEC_JIFFIE_SC

and rounded this calculation up in the intermediate form (since we
can't necessarily exactly represent TICK_NSEC in usec.) But the
scaling arithmetic is a (very slight) *over*approximation of the true
value; that is, instead of dividing by (1 usec/ 1 jiffie), we
effectively divided by (1 usec/1 jiffie)-epsilon (rounding
down). This would normally be fine, but we want to round timeouts up,
and we did so by adding 2^USEC_JIFFIE_SC - 1 before the shift; this
would be fine if our division was exact, but dividing this by the
slightly smaller factor was equivalent to adding just _over_ 1 to the
final result (instead of just _under_ 1, as desired.)

In particular, with HZ=1000, we consistently computed that 10000 usec
was 11 jiffies; the same was true for any exact multiple of
TICK_NSEC.

We could possibly still round in the intermediate form, adding
something less than 2^USEC_JIFFIE_SC - 1, but easier still is to
convert usec->nsec, round in nanoseconds, and then convert using
time*spec*_to_jiffies.  This adds one constant multiplication, and is
not observably slower in microbenchmarks on recent x86 hardware.

Tested: the following program:

int main() {
  struct itimerval zero = {{0, 0}, {0, 0}};
  /* Initially set to 10 ms. */
  struct itimerval initial = zero;
  initial.it_interval.tv_usec = 10000;
  setitimer(ITIMER_PROF, &initial, NULL);
  /* Save and restore several times. */
  for (size_t i = 0; i < 10; ++i) {
    struct itimerval prev;
    setitimer(ITIMER_PROF, &zero, &prev);
    /* on old kernels, this goes up by TICK_USEC every iteration */
    printf("previous value: %ld %ld %ld %ld\n",
           prev.it_interval.tv_sec, prev.it_interval.tv_usec,
           prev.it_value.tv_sec, prev.it_value.tv_usec);
    setitimer(ITIMER_PROF, &prev, NULL);
  }
    return 0;
}

Cc: stable@vger.kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul Turner <pjt@google.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reviewed-by: Paul Turner <pjt@google.com>
Reported-by: Aaron Jacobs <jacobsa@google.com>
Signed-off-by: Andrew Hunter <ahh@google.com>
[jstultz: Tweaked to apply to 3.17-rc]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/jiffies.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1f44466c1e9d..c367cbdf73ab 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -258,23 +258,11 @@ extern unsigned long preset_lpj;
 #define SEC_JIFFIE_SC (32 - SHIFT_HZ)
 #endif
 #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
-#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19)
 #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
                                 TICK_NSEC -1) / (u64)TICK_NSEC))
 
 #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
                                         TICK_NSEC -1) / (u64)TICK_NSEC))
-#define USEC_CONVERSION  \
-                    ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\
-                                        TICK_NSEC -1) / (u64)TICK_NSEC))
-/*
- * USEC_ROUND is used in the timeval to jiffie conversion.  See there
- * for more details.  It is the scaled resolution rounding value.  Note
- * that it is a 64-bit value.  Since, when it is applied, we are already
- * in jiffies (albit scaled), it is nothing but the bits we will shift
- * off.
- */
-#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1)
 /*
  * The maximum jiffie value is (MAX_INT >> 1).  Here we translate that
  * into seconds.  The 64-bit case will overflow if we are not careful,
-- 
cgit v1.2.3


From 23d0db76ffa13ffb95229946e4648568c3c29db5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 13 Sep 2014 11:24:03 -0700
Subject: Make hash_64() use a 64-bit multiply when appropriate

The hash_64() function historically does the multiply by the
GOLDEN_RATIO_PRIME_64 number with explicit shifts and adds, because
unlike the 32-bit case, gcc seems unable to turn the constant multiply
into the more appropriate shift and adds when required.

However, that means that we generate those shifts and adds even when the
architecture has a fast multiplier, and could just do it better in
hardware.

Use the now-cleaned-up CONFIG_ARCH_HAS_FAST_MULTIPLIER (together with
"is it a 64-bit architecture") to decide whether to use an integer
multiply or the explicit sequence of shift/add instructions.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hash.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hash.h b/include/linux/hash.h
index bd1754c7ecef..d0494c399392 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -37,6 +37,9 @@ static __always_inline u64 hash_64(u64 val, unsigned int bits)
 {
 	u64 hash = val;
 
+#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
+	hash = hash * GOLDEN_RATIO_PRIME_64;
+#else
 	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
 	u64 n = hash;
 	n <<= 18;
@@ -51,6 +54,7 @@ static __always_inline u64 hash_64(u64 val, unsigned int bits)
 	hash += n;
 	n <<= 2;
 	hash += n;
+#endif
 
 	/* High bits are more random, so use them. */
 	return hash >> (64 - bits);
-- 
cgit v1.2.3


From 9226b5b440f2b4fbb3b797f3cb74a9a627220660 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 14 Sep 2014 17:28:32 -0700
Subject: vfs: avoid non-forwarding large load after small store in path lookup

The performance regression that Josef Bacik reported in the pathname
lookup (see commit 99d263d4c5b2 "vfs: fix bad hashing of dentries") made
me look at performance stability of the dcache code, just to verify that
the problem was actually fixed.  That turned up a few other problems in
this area.

There are a few cases where we exit RCU lookup mode and go to the slow
serializing case when we shouldn't, Al has fixed those and they'll come
in with the next VFS pull.

But my performance verification also shows that link_path_walk() turns
out to have a very unfortunate 32-bit store of the length and hash of
the name we look up, followed by a 64-bit read of the combined hash_len
field.  That screws up the processor store to load forwarding, causing
an unnecessary hickup in this critical routine.

It's caused by the ugly calling convention for the "hash_name()"
function, and easily fixed by just making hash_name() fill in the whole
'struct qstr' rather than passing it a pointer to just the hash value.

With that, the profile for this function looks much smoother.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dcache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index e4ae2ad48d07..75a227cc7ce2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -55,6 +55,7 @@ struct qstr {
 #define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
 #define hashlen_hash(hashlen) ((u32) (hashlen))
 #define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
+#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash))
 
 struct dentry_stat_t {
 	long nr_dentry;
-- 
cgit v1.2.3