From b1b38278e12b04cf9a227f6af2c24651cf6e8a85 Mon Sep 17 00:00:00 2001
From: David Weinehall <david.weinehall@linux.intel.com>
Date: Wed, 20 May 2015 17:00:13 +0300
Subject: drm/i915: add a context parameter to {en, dis}able zero address
 mapping

Export a new context parameter that can be set/queried through the
context_{get,set}param ioctls.  This parameter is passed as a context
flag and decides whether or not a GPU address mapping is allowed to
be made at address zero.  The default is to allow such mappings.

Signed-off-by: David Weinehall <david.weinehall@intel.com>
Acked-by: "Zou, Nanhai" <nanhai.zou@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 include/uapi/drm/i915_drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6e1a2ed116cb..92d61a7c942a 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1106,6 +1106,7 @@ struct drm_i915_gem_context_param {
 	__u32 size;
 	__u64 param;
 #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1
+#define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2
 	__u64 value;
 };
 
-- 
cgit v1.2.3


From 49e4d842f0d0892c3d26c93a81b9f22c1467030e Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 15 Jun 2015 12:23:48 +0100
Subject: drm/i915: Report to userspace if we have a (presumed) working GPU
 reset

In igt, we want to test handling of GPU hangs, both for recovery
purposes and for reporting. However, we don't want to inject a genuine
GPU hang onto a machine that cannot recover and so be permenantly
wedged. Rather than embed heuristics into igt, have the kernel report
exactly when it expects the GPU reset to work.

This can also be usefully extended in future to indicate different
levels of fine-grained resets.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tim Gore <tim.gore@intel.com>
Cc: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 include/uapi/drm/i915_drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 92d61a7c942a..f88cc1cac5d9 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -354,6 +354,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_REVISION              32
 #define I915_PARAM_SUBSLICE_TOTAL	 33
 #define I915_PARAM_EU_TOTAL		 34
+#define I915_PARAM_HAS_GPU_RESET	 35
 
 typedef struct drm_i915_getparam {
 	int param;
-- 
cgit v1.2.3


From a9ed33ca075f712cc7fd96eb84e3d322012fcaaf Mon Sep 17 00:00:00 2001
From: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Date: Wed, 1 Jul 2015 10:12:23 +0300
Subject: drm/i915: Expose I915_EXEC_RESOURCE_STREAMER flag and getparam

Ensures that the batch buffer is executed by the resource streamer.
And will let userspace know whether Resource Streamer is supported in
the kernel.

v2: Don't skip 1<<15 for the exec flags (Jani Nikula)
v3: Use HAS_RESOURCE_STREAMER macro for execbuf validation (Chris Wilson)

(from getparam patch)

v2: Update I915_PARAM_HAS_RESOURCE_STREAMER so it's after
    I915_PARAM_HAS_GPU_RESET.
v3: Only advertise RS support for hardware that supports it.
v4: Add HAS_RESOURCE_STREAMER() macro (Chris)

Testcase: igt/gem_exec_params
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
[danvet: squash in getparam patch since it'd break bisect, suggested
by Chris.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 include/uapi/drm/i915_drm.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f88cc1cac5d9..e7c29f1659ad 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -355,6 +355,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_SUBSLICE_TOTAL	 33
 #define I915_PARAM_EU_TOTAL		 34
 #define I915_PARAM_HAS_GPU_RESET	 35
+#define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
 	int param;
@@ -765,7 +766,12 @@ struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_BSD_RING1		(1<<13)
 #define I915_EXEC_BSD_RING2		(2<<13)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(1<<15)
+/** Tell the kernel that the batchbuffer is processed by
+ *  the resource streamer.
+ */
+#define I915_EXEC_RESOURCE_STREAMER     (1<<15)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
cgit v1.2.3


From 9aee1ae3312daf0de4c9c614680d06d557133317 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 16 May 2015 15:11:40 -0300
Subject: [media] media: uapi: vsp1: Use __u32 instead of u32

Don't use the kernel types in uapi headers.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/vsp1.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/vsp1.h b/include/uapi/linux/vsp1.h
index e18858f6e865..9a823696d816 100644
--- a/include/uapi/linux/vsp1.h
+++ b/include/uapi/linux/vsp1.h
@@ -28,7 +28,7 @@
 	_IOWR('V', BASE_VIDIOC_PRIVATE + 1, struct vsp1_lut_config)
 
 struct vsp1_lut_config {
-	u32 lut[256];
+	__u32 lut[256];
 };
 
 #endif	/* __VSP1_USER_H__ */
-- 
cgit v1.2.3


From 1055e0687f581cf988bb2239a00d1396e18ef114 Mon Sep 17 00:00:00 2001
From: Chad Versace <chad.versace@intel.com>
Date: Thu, 9 Jul 2015 01:38:42 -0700
Subject: drm/fourcc: Add formats R8, RG88, GR88
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Kodi/XBMC developers want to transcode NV12 to RGB with OpenGL
shaders, importing the two source planes through
EGL_EXT_image_dma_buf_import. That requires importing the Y plane as an
R8 EGLImage and the UV plane as either an RG88 or GR88 EGLImage.

CC: Peter Frühberger <peter.fruehberger@gmail.com>
Cc: Rainer Hochecker <rainer.hochecker@onlinehome.de>
Cc: Benjamin Widawsky <benjamin.widawsky@intel.com>
Reviewed-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
Signed-off-by: Chad Versace <chad.versace@intel.com>
---
 include/uapi/drm/drm_fourcc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 2f295cde657e..8c5e8b91a3cb 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -34,6 +34,13 @@
 /* color index */
 #define DRM_FORMAT_C8		fourcc_code('C', '8', ' ', ' ') /* [7:0] C */
 
+/* 8 bpp Red */
+#define DRM_FORMAT_R8		fourcc_code('R', '8', ' ', ' ') /* [7:0] R */
+
+/* 16 bpp RG */
+#define DRM_FORMAT_RG88		fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */
+#define DRM_FORMAT_GR88		fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */
+
 /* 8 bpp RGB */
 #define DRM_FORMAT_RGB332	fourcc_code('R', 'G', 'B', '8') /* [7:0] R:G:B 3:3:2 */
 #define DRM_FORMAT_BGR233	fourcc_code('B', 'G', 'R', '8') /* [7:0] B:G:R 2:3:3 */
-- 
cgit v1.2.3


From 74fe61f17e999a458d5f64ca2aa9a0282ca32198 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 10 Jul 2015 08:02:08 -0700
Subject: bridge: mdb: add vlan support for user entries

Until now all user mdb entries were added in vlan 0, this patch adds
support to allow the user to specify the vlan for the entry.
About the uapi change a hole in struct br_mdb_entry is used so the size
and offsets are kept the same (verified with pahole and tested with older
iproute2).

Example:
$ bridge mdb
dev br0 port eth1 grp 239.0.0.1 permanent vlan 2000
dev br0 port eth1 grp 239.0.0.1 permanent vlan 200
dev br0 port eth1 grp 239.0.0.1 permanent

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index eaaea6208b42..3635b7797508 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -182,6 +182,7 @@ struct br_mdb_entry {
 #define MDB_TEMPORARY 0
 #define MDB_PERMANENT 1
 	__u8 state;
+	__u16 vid;
 	struct {
 		union {
 			__be32	ip4;
-- 
cgit v1.2.3


From 346add7834557b5b9628b9bf2387106d42e631d4 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 14 Jul 2015 18:07:30 +0200
Subject: drm/i915: Use expcitly fixed type in compat32 structs

I was confused shortly whether the compat was needed for the int,
until I noticed the pointer in the original.

Also remove typedef.

v2: Review from Chris.
- Add comments.
- Also change the int param in the original structure.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 include/uapi/drm/i915_drm.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index e7c29f1659ad..192027b4f031 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -358,7 +358,11 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
-	int param;
+	s32 param;
+	/*
+	 * WARNING: Using pointers instead of fixed-size u64 means we need to write
+	 * compat32 code. Don't repeat this mistake.
+	 */
 	int __user *value;
 } drm_i915_getparam_t;
 
-- 
cgit v1.2.3


From 13c4a90119d28cfcb6b5bdd820c233b86c2b0237 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen@canonical.com>
Date: Sat, 13 Jun 2015 09:02:48 -0600
Subject: seccomp: add ptrace options for suspend/resume

This patch is the first step in enabling checkpoint/restore of processes
with seccomp enabled.

One of the things CRIU does while dumping tasks is inject code into them
via ptrace to collect information that is only available to the process
itself. However, if we are in a seccomp mode where these processes are
prohibited from making these syscalls, then what CRIU does kills the task.

This patch adds a new ptrace option, PTRACE_O_SUSPEND_SECCOMP, that enables
a task from the init user namespace which has CAP_SYS_ADMIN and no seccomp
filters to disable (and re-enable) seccomp filters for another task so that
they can be successfully dumped (and restored). We restrict the set of
processes that can disable seccomp through ptrace because although today
ptrace can be used to bypass seccomp, there is some discussion of closing
this loophole in the future and we would like this patch to not depend on
that behavior and be future proofed for when it is removed.

Note that seccomp can be suspended before any filters are actually
installed; this behavior is useful on criu restore, so that we can suspend
seccomp, restore the filters, unmap our restore code from the restored
process' address space, and then resume the task by detaching and have the
filters resumed as well.

v2 changes:

* require that the tracer have no seccomp filters installed
* drop TIF_NOTSC manipulation from the patch
* change from ptrace command to a ptrace option and use this ptrace option
  as the flag to check. This means that as soon as the tracer
  detaches/dies, seccomp is re-enabled and as a corrollary that one can not
  disable seccomp across PTRACE_ATTACHs.

v3 changes:

* get rid of various #ifdefs everywhere
* report more sensible errors when PTRACE_O_SUSPEND_SECCOMP is incorrectly
  used

v4 changes:

* get rid of may_suspend_seccomp() in favor of a capable() check in ptrace
  directly

v5 changes:

* check that seccomp is not enabled (or suspended) on the tracer

Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
CC: Will Drewry <wad@chromium.org>
CC: Roland McGrath <roland@hack.frob.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
[kees: access seccomp.mode through seccomp_mode() instead]
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/ptrace.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index cf1019e15f5b..a7a697986614 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_O_TRACESECCOMP	(1 << PTRACE_EVENT_SECCOMP)
 
 /* eventless options */
-#define PTRACE_O_EXITKILL	(1 << 20)
+#define PTRACE_O_EXITKILL		(1 << 20)
+#define PTRACE_O_SUSPEND_SECCOMP	(1 << 21)
 
-#define PTRACE_O_MASK		(0x000000ff | PTRACE_O_EXITKILL)
+#define PTRACE_O_MASK		(\
+	0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP)
 
 #include <asm/ptrace.h>
 
-- 
cgit v1.2.3


From 88d6378bd6c096cb8440face3ae3f33d55a2e6e4 Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Date: Tue, 14 Jul 2015 13:43:20 -0700
Subject: netlink: changes for setting and clearing protodown via netlink.

Signed-off-by: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 2c7e8e3d3981..24d68b797c59 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -148,6 +148,7 @@ enum {
 	IFLA_PHYS_SWITCH_ID,
 	IFLA_LINK_NETNSID,
 	IFLA_PHYS_PORT_NAME,
+	IFLA_PROTO_DOWN,
 	__IFLA_MAX
 };
 
-- 
cgit v1.2.3


From d32d98642de66048f9534a05f3641558e811bbc9 Mon Sep 17 00:00:00 2001
From: Mats Randgaard <matrandg@cisco.com>
Date: Thu, 9 Jul 2015 05:45:47 -0300
Subject: [media] Driver for Toshiba TC358743 HDMI to CSI-2 bridge

The driver is tested on our hardware and all the implemented features
works as expected.

Missing features:
- CEC support
- HDCP repeater support
- IR support

Signed-off-by: Mats Randgaard <matrandg@cisco.com>
[hans.verkuil@cisco.com: updated copyright year to 2015]
[hans.verkuil@cisco.com: update confusing confctl_mutex comment]
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>

Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/v4l2-controls.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 9f6e108ff4a0..d448c536b49d 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -174,6 +174,10 @@ enum v4l2_colorfx {
  * We reserve 16 controls for this driver. */
 #define V4L2_CID_USER_ADV7180_BASE		(V4L2_CID_USER_BASE + 0x1070)
 
+/* The base for the tc358743 driver controls.
+ * We reserve 16 controls for this driver. */
+#define V4L2_CID_USER_TC358743_BASE		(V4L2_CID_USER_BASE + 0x1080)
+
 /* MPEG-class control IDs */
 /* The MPEG controls are applicable to all codec controls
  * and the 'MPEG' part of the define is historical */
-- 
cgit v1.2.3


From eff3cddc222c88943ff515ae9335687c9e2cbaf6 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Wed, 22 Apr 2015 14:40:30 -0700
Subject: clarify implementation of ethtool's get_ts_info op

This patch adds some clarification about the intended way to implement
both SIOCSHWTSTAMP and ethtool's get_ts_info. The HWTSTAMP API has
several Rx filters which are very specific, as well as more general
filters. The specific filters really only exist to support some broken
hardware which can't fully implement the generic filters. This patch
adds clarification that it is okay to support the specific filters in
SIOCSHWTSTAMP by upscaling them to the generic filters. In addition,
update the header for ethtool_ts_info to specify that drivers ought to
only report the filters they support without upscaling in this manner.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Reviewed-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/uapi/linux/ethtool.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index cd67aec187d9..cd1629170103 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1093,6 +1093,11 @@ struct ethtool_sfeatures {
  * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values,
  * respectively.  For example, if the device supports HWTSTAMP_TX_ON,
  * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set.
+ *
+ * Drivers should only report the filters they actually support without
+ * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWSTAMP request for
+ * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the
+ * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op.
  */
 struct ethtool_ts_info {
 	__u32	cmd;
-- 
cgit v1.2.3


From 7b8f4586532f36c5541a15d072576e7e89a5df75 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Fri, 3 Jul 2015 19:39:02 +0800
Subject: nfsd: Add macro NFS_ACL_MASK for ACL

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/uapi/linux/nfsacl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nfsacl.h b/include/uapi/linux/nfsacl.h
index 9bb9771a107f..552726631162 100644
--- a/include/uapi/linux/nfsacl.h
+++ b/include/uapi/linux/nfsacl.h
@@ -22,6 +22,7 @@
 #define NFS_ACLCNT		0x0002
 #define NFS_DFACL		0x0004
 #define NFS_DFACLCNT		0x0008
+#define NFS_ACL_MASK		0x000f
 
 /* Flag for Default ACL entries */
 #define NFS_ACL_DEFAULT		0x1000
-- 
cgit v1.2.3


From 8d20aabe1c76cccac544d9fcc3ad7823d9e98a2d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jul 2015 14:21:42 +0200
Subject: ebpf: add helper to retrieve net_cls's classid cookie

It would be very useful to retrieve the net_cls's classid from an eBPF
program to allow for a more fine-grained classification, it could be
directly used or in conjunction with additional policies. I.e. docker,
but also tooling such as cgexec, can easily run applications via net_cls
cgroups:

  cgcreate -g net_cls:/foo
  echo 42 > foo/net_cls.classid
  cgexec -g net_cls:foo <prog>

Thus, their respecitve classid cookie of foo can then be looked up on
the egress path to apply further policies. The helper is desigend such
that a non-zero value returns the cgroup id.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f99e43d..2de87e58b12b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -249,6 +249,13 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_get_current_comm,
+
+	/**
+	 * bpf_get_cgroup_classid(skb) - retrieve a proc's classid
+	 * @skb: pointer to skb
+	 * Return: classid if != 0
+	 */
+	BPF_FUNC_get_cgroup_classid,
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From 4e10df9a60d96ced321dd2af71da558c6b750078 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 20 Jul 2015 20:34:18 -0700
Subject: bpf: introduce bpf_skb_vlan_push/pop() helpers

Allow eBPF programs attached to TC qdiscs call skb_vlan_push/pop via
helper functions. These functions may change skb->data/hlen which are
cached by some JITs to improve performance of ld_abs/ld_ind instructions.
Therefore JITs need to recognize bpf_skb_vlan_push/pop() calls,
re-compute header len and re-cache skb->data/hlen back into cpu registers.
Note, skb->data/hlen are not directly accessible from the programs,
so any changes to skb->data done either by these helpers or by other
TC actions are safe.

eBPF JIT supported by three architectures:
- arm64 JIT is using bpf_load_pointer() without caching, so it's ok as-is.
- x64 JIT re-caches skb->data/hlen unconditionally after vlan_push/pop calls
  (experiments showed that conditional re-caching is slower).
- s390 JIT falls back to interpreter for now when bpf_skb_vlan_push() is present
  in the program (re-caching is tbd).

These helpers allow more scalable handling of vlan from the programs.
Instead of creating thousands of vlan netdevs on top of eth0 and attaching
TC+ingress+bpf to all of them, the program can be attached to eth0 directly
and manipulate vlans as necessary.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2de87e58b12b..2f6c83d714e9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -256,6 +256,8 @@ enum bpf_func_id {
 	 * Return: classid if != 0
 	 */
 	BPF_FUNC_get_cgroup_classid,
+	BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */
+	BPF_FUNC_skb_vlan_pop,  /* bpf_skb_vlan_pop(skb) */
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From 8ab30c1538b14424015e45063c41d509b24c1dea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Tue, 7 Jul 2015 17:29:53 +0100
Subject: KVM: add comments for kvm_debug_exit_arch struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bring into line with the comments for the other structures and their
KVM_EXIT_* cases. Also update api.txt to reflect use in kvm_run
documentation.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/uapi/linux/kvm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 716ad4ae4d4b..4ab3c6a8d563 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -237,6 +237,7 @@ struct kvm_run {
 			__u32 count;
 			__u64 data_offset; /* relative to kvm_run start */
 		} io;
+		/* KVM_EXIT_DEBUG */
 		struct {
 			struct kvm_debug_exit_arch arch;
 		} debug;
@@ -285,6 +286,7 @@ struct kvm_run {
 			__u32 data;
 			__u8  is_write;
 		} dcr;
+		/* KVM_EXIT_INTERNAL_ERROR */
 		struct {
 			__u32 suberror;
 			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
@@ -295,6 +297,7 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		/* KVM_EXIT_PAPR_HCALL */
 		struct {
 			__u64 nr;
 			__u64 ret;
-- 
cgit v1.2.3


From 5540546bc93b49f98a0466fe3f96615286c76574 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Tue, 7 Jul 2015 17:30:01 +0100
Subject: KVM: arm64: guest debug, HW assisted debug support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for userspace to control the HW debug registers for
guest debug. In the debug ioctl we copy an IMPDEF registers into a new
register set called host_debug_state.

We use the recently introduced vcpu parameter debug_ptr to select which
register set is copied into the real registers when world switch occurs.

I've made some helper functions from hw_breakpoint.c more widely
available for re-use.

As with single step we need to tweak the guest registers to enable the
exceptions so we need to save and restore those bits.

Two new capabilities have been added to the KVM_EXTENSION ioctl to allow
userspace to query the number of hardware break and watch points
available on the host hardware.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/uapi/linux/kvm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4ab3c6a8d563..a1e08e7bbf20 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -820,6 +820,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From a0d9a8604f29ee3340126ec3f90c9421f930aa50 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:45 +0200
Subject: rtnetlink: introduce new RTA_ENCAP_TYPE and RTA_ENCAP attributes

This patch introduces two new RTA attributes to attach encap
data to fib routes.

Example iproute2 command to attach mpls encap data to ipv4 routes

$ip route add 10.1.1.0/30 encap mpls 200 via inet 10.1.1.1 dev swp1

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index fdd8f07f1d34..0d3d3cc43356 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -308,6 +308,8 @@ enum rtattr_type_t {
 	RTA_VIA,
 	RTA_NEWDST,
 	RTA_PREF,
+	RTA_ENCAP_TYPE,
+	RTA_ENCAP,
 	__RTA_MAX
 };
 
-- 
cgit v1.2.3


From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:46 +0200
Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls

Provides infrastructure to parse/dump/store encap information for
light weight tunnels like mpls. Encap information for such tunnels
is associated with fib routes.

This infrastructure is based on previous suggestions from
Eric Biederman to follow the xfrm infrastructure.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 include/uapi/linux/lwtunnel.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
new file mode 100644
index 000000000000..aa611d931a31
--- /dev/null
+++ b/include/uapi/linux/lwtunnel.h
@@ -0,0 +1,15 @@
+#ifndef _UAPI_LWTUNNEL_H_
+#define _UAPI_LWTUNNEL_H_
+
+#include <linux/types.h>
+
+enum lwtunnel_encap_types {
+	LWTUNNEL_ENCAP_NONE,
+	LWTUNNEL_ENCAP_MPLS,
+	__LWTUNNEL_ENCAP_MAX,
+};
+
+#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1)
+
+
+#endif /* _UAPI_LWTUNNEL_H_ */
-- 
cgit v1.2.3


From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:53 +0200
Subject: mpls: ip tunnel support

This implementation uses lwtunnel infrastructure to register
hooks for mpls tunnel encaps.

It picks cues from iptunnel_encaps infrastructure and previous
mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mpls_iptunnel.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 include/uapi/linux/mpls_iptunnel.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..d80a0498f77e
--- /dev/null
+++ b/include/uapi/linux/mpls_iptunnel.h
@@ -0,0 +1,28 @@
+/*
+ *	mpls tunnel api
+ *
+ *	Authors:
+ *		Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H
+#define _UAPI_LINUX_MPLS_IPTUNNEL_H
+
+/* MPLS tunnel attributes
+ * [RTA_ENCAP] = {
+ *     [MPLS_IPTUNNEL_DST]
+ * }
+ */
+enum {
+	MPLS_IPTUNNEL_UNSPEC,
+	MPLS_IPTUNNEL_DST,
+	__MPLS_IPTUNNEL_MAX,
+};
+#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1)
+
+#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */
-- 
cgit v1.2.3


From 1d8fff907342d2339796dbd27ea47d0e76a6a2d0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:54 +0200
Subject: ip_tunnel: Make ovs_tunnel_info and ovs_key_ipv4_tunnel generic

Rename the tunnel metadata data structures currently internal to
OVS and make them generic for use by all IP tunnels.

Both structures are kernel internal and will stay that way. Their
members are exposed to user space through individual Netlink
attributes by OVS. It will therefore be possible to extend/modify
these structures without affecting user ABI.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1dab77601c21..d6b885460187 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -321,7 +321,7 @@ enum ovs_key_attr {
 				 * the accepted length of the array. */
 
 #ifdef __KERNEL__
-	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ovs_tunnel_info */
+	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
 #endif
 	__OVS_KEY_ATTR_MAX
 };
-- 
cgit v1.2.3


From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:58 +0200
Subject: vxlan: Flow based tunneling

Allows putting a VXLAN device into a new flow-based mode in which
skbs with a ip_tunnel_info dst metadata attached will be encapsulated
according to the instructions stored in there with the VXLAN device
defaults taken into consideration.

Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is
set, the packet processing will populate a ip_tunnel_info struct for
each packet received and attach it to the skb using the new metadata
dst.  The metadata structure will contain the outer header and tunnel
header fields which have been stripped off. Layers further up in the
stack such as routing, tc or netfitler can later match on these fields
and perform forwarding. It is the responsibility of upper layers to
ensure that the flag is set if the metadata is needed. The flag limits
the additional cost of metadata collecting based on demand.

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.

It also allows for OVS to leverage this mode which in turn allows for
the removal of the OVS specific VXLAN code.

Because the skb is currently scrubed in vxlan_rcv(), the attachment of
the new dst metadata is postponed until after scrubing which requires
the temporary addition of a new member to vxlan_metadata. This member
is removed again in a later commit after the indirect VXLAN receive API
has been removed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 24d68b797c59..9eeb5d9cf8f0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -382,6 +382,7 @@ enum {
 	IFLA_VXLAN_REMCSUM_RX,
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
+	IFLA_VXLAN_FLOWBASED,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 3093fbe7ff4bc7d1571fc217dade1cf80330a714 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:44:00 +0200
Subject: route: Per route IP tunnel metadata via lightweight tunnel

This introduces a new IP tunnel lightweight tunnel type which allows
to specify IP tunnel instructions per route. Only IPv4 is supported
at this point.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h  |  1 +
 include/uapi/linux/rtnetlink.h | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index aa611d931a31..31377bbea3f8 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -6,6 +6,7 @@
 enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
+	LWTUNNEL_ENCAP_IP,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc43356..47d24cb3fbc1 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -286,6 +286,21 @@ enum rt_class_t {
 
 /* Routing message attributes */
 
+enum ip_tunnel_t {
+	IP_TUN_UNSPEC,
+	IP_TUN_ID,
+	IP_TUN_DST,
+	IP_TUN_SRC,
+	IP_TUN_TTL,
+	IP_TUN_TOS,
+	IP_TUN_SPORT,
+	IP_TUN_DPORT,
+	IP_TUN_FLAGS,
+	__IP_TUN_MAX,
+};
+
+#define IP_TUN_MAX (__IP_TUN_MAX - 1)
+
 enum rtattr_type_t {
 	RTA_UNSPEC,
 	RTA_DST,
-- 
cgit v1.2.3


From e7030878fc8448492b6e5cecd574043f63271298 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:44:01 +0200
Subject: fib: Add fib rule match on tunnel id

This add the ability to select a routing table based on the tunnel
id which allows to maintain separate routing tables for each virtual
tunnel network.

ip rule add from all tunnel-id 100 lookup 100
ip rule add from all tunnel-id 200 lookup 200

A new static key controls the collection of metadata at tunnel level
upon demand.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/fib_rules.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 2b82d7e30974..96161b8202b5 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -43,7 +43,7 @@ enum {
 	FRA_UNUSED5,
 	FRA_FWMARK,	/* mark */
 	FRA_FLOW,	/* flow/class id */
-	FRA_UNUSED6,
+	FRA_TUN_ID,
 	FRA_SUPPRESS_IFGROUP,
 	FRA_SUPPRESS_PREFIXLEN,
 	FRA_TABLE,	/* Extended table id */
-- 
cgit v1.2.3


From b56ea2985d389a3676638203323ebe22c261b7fe Mon Sep 17 00:00:00 2001
From: Rick Jones <rick.jones2@hp.com>
Date: Tue, 21 Jul 2015 16:14:13 -0700
Subject: net: track success and failure of TCP PMTU probing

Track success and failure of TCP PMTU probing.

Signed-off-by: Rick Jones <rick.jones2@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index eee8968407f0..25a9ad8bcef1 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -278,6 +278,8 @@ enum
 	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,	/* TCPACKSkippedChallenge */
 	LINUX_MIB_TCPWINPROBE,			/* TCPWinProbe */
 	LINUX_MIB_TCPKEEPALIVE,			/* TCPKeepAlive */
+	LINUX_MIB_TCPMTUPFAIL,			/* TCPMTUPFail */
+	LINUX_MIB_TCPMTUPSUCCESS,		/* TCPMTUPSuccess */
 	__LINUX_MIB_MAX
 };
 
-- 
cgit v1.2.3


From 3985e8a3611a93bb36789f65db862e5700aab65e Mon Sep 17 00:00:00 2001
From: Erik Kline <ek@google.com>
Date: Wed, 22 Jul 2015 16:38:25 +0900
Subject: ipv6: sysctl to restrict candidate source addresses

Per RFC 6724, section 4, "Candidate Source Addresses":

    It is RECOMMENDED that the candidate source addresses be the set
    of unicast addresses assigned to the interface that will be used
    to send to the destination (the "outgoing" interface).

Add a sysctl to enable this behaviour.

Signed-off-by: Erik Kline <ek@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 5efa54ae567c..641a146ead7d 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -171,6 +171,7 @@ enum {
 	DEVCONF_USE_OPTIMISTIC,
 	DEVCONF_ACCEPT_RA_MTU,
 	DEVCONF_STABLE_SECRET,
+	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_MAX
 };
 
-- 
cgit v1.2.3


From 2ce7918990641b07e70e1b25752d666369e2016e Mon Sep 17 00:00:00 2001
From: Andrey Smetanin <asmetanin@virtuozzo.com>
Date: Fri, 3 Jul 2015 15:01:41 +0300
Subject: kvm/x86: add sending hyper-v crash notification to user space

Sending of notification is done by exiting vcpu to user space
if KVM_REQ_HV_CRASH is enabled for vcpu. At exit to user space
the kvm_run structure contains system_event with type
KVM_SYSTEM_EVENT_CRASH to notify about guest crash occurred.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Peter Hornyack <peterhornyack@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Gleb Natapov <gleb@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 716ad4ae4d4b..9ef19ebd9df4 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -317,6 +317,7 @@ struct kvm_run {
 		struct {
 #define KVM_SYSTEM_EVENT_SHUTDOWN       1
 #define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
 			__u32 type;
 			__u64 flags;
 		} system_event;
-- 
cgit v1.2.3


From 45c6df4471edf5404463756df1014fe7e36db3d3 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 23 Jun 2015 19:09:59 +0200
Subject: tty: linux/gsmmux.h needs linux/types.h

We use __u8 in linux/gsmmux.h, so include linux/types.h to have that
defined.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/gsmmux.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/gsmmux.h b/include/uapi/linux/gsmmux.h
index c06742d52856..ab055d8cddef 100644
--- a/include/uapi/linux/gsmmux.h
+++ b/include/uapi/linux/gsmmux.h
@@ -3,6 +3,7 @@
 
 #include <linux/if.h>
 #include <linux/ioctl.h>
+#include <linux/types.h>
 
 struct gsm_config
 {
-- 
cgit v1.2.3


From 45ac1403f564f411c6a383a2448688ba8dd705a4 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 21 Jul 2015 12:44:02 +0300
Subject: perf: Add PERF_RECORD_SWITCH to indicate context switches

There are already two events for context switches, namely the tracepoint
sched:sched_switch and the software event context_switches.
Unfortunately neither are suitable for use by non-privileged users for
the purpose of synchronizing hardware trace data (e.g. Intel PT) to the
context switch.

Tracepoints are no good at all for non-privileged users because they
need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= -1.

On the other hand, kernel software events need either CAP_SYS_ADMIN or
/proc/sys/kernel/perf_event_paranoid <= 1.

Now many distributions do default perf_event_paranoid to 1 making
context_switches a contender, except it has another problem (which is
also shared with sched:sched_switch) which is that it happens before
perf schedules events out instead of after perf schedules events in.
Whereas a privileged user can see all the events anyway, a
non-privileged user only sees events for their own processes, in other
words they see when their process was scheduled out not when it was
scheduled in. That presents two problems to use the event:

1. the information comes too late, so tools have to look ahead in the
   event stream to find out what the current state is

2. if they are unlucky tracing might have stopped before the
   context-switches event is recorded.

This new PERF_RECORD_SWITCH event does not have those problems
and it also has a couple of other small advantages.

It is easier to use because it is an auxiliary event (like mmap, comm
and task events) which can be enabled by setting a single bit. It is
smaller than sched:sched_switch and easier to parse.

To make the event useful for privileged users also, if the
context is cpu-wide then the event record will be
PERF_RECORD_SWITCH_CPU_WIDE which is the same as
PERF_RECORD_SWITCH except it also provides the next or
previous pid/tid.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Jiri Olsa <jolsa@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Pawel Moll <pawel.moll@arm.com>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1437471846-26995-2-git-send-email-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/uapi/linux/perf_event.h | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d97f84c080da..022d0acf7df0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -330,7 +330,8 @@ struct perf_event_attr {
 				mmap2          :  1, /* include mmap with inode data     */
 				comm_exec      :  1, /* flag comm events that are due to an exec */
 				use_clockid    :  1, /* use @clockid for time fields */
-				__reserved_1   : 38;
+				context_switch :  1, /* context switch data */
+				__reserved_1   : 37;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -572,9 +573,11 @@ struct perf_event_mmap_page {
 /*
  * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
  * different events so can reuse the same bit position.
+ * Ditto PERF_RECORD_MISC_SWITCH_OUT.
  */
 #define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
+#define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
 /*
  * Indicates that the content of PERF_SAMPLE_IP points to
  * the actual instruction that triggered the event. See also
@@ -818,6 +821,32 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_LOST_SAMPLES		= 13,
 
+	/*
+	 * Records a context switch in or out (flagged by
+	 * PERF_RECORD_MISC_SWITCH_OUT). See also
+	 * PERF_RECORD_SWITCH_CPU_WIDE.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_SWITCH			= 14,
+
+	/*
+	 * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
+	 * next_prev_tid that are the next (switching out) or previous
+	 * (switching in) pid/tid.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				next_prev_pid;
+	 *	u32				next_prev_tid;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_SWITCH_CPU_WIDE		= 15,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
-- 
cgit v1.2.3


From fc5462f8525b47fa219452289ecb22c921c16823 Mon Sep 17 00:00:00 2001
From: Azael Avalos <coproscefalo@gmail.com>
Date: Wed, 22 Jul 2015 18:09:11 -0600
Subject: toshiba_acpi: Add /dev/toshiba_acpi device

There were previous attempts to "merge" the toshiba SMM module to the
toshiba_acpi one, they were trying to imitate what the old toshiba
module does, however, some models (TOS1900 devices) come with a
"crippled" implementation and do not provide all the "features" a
"genuine" Toshiba BIOS does.

This patch adds a new device called toshiba_acpi, which aim is to
enable userspace to access the SMM on Toshiba laptops via ACPI calls.

Creating a new convenience _IOWR command to access the SCI functions
by opening/closing the SCI internally to avoid buggy BIOS, while at
the same time providing backwards compatibility.

Older programs (and new) who wish to access the SMM on newer models
can do it by pointing their path to /dev/toshiba_acpi (instead of
/dev/toshiba) as the toshiba.h header was modified to reflect these
changes as well as adds all the toshiba_acpi paths and command,
however, it is strongly recommended to use the new IOCTL for any
SCI command to avoid any buggy BIOS.

Signed-off-by: Azael Avalos <coproscefalo@gmail.com>
Signed-off-by: Darren Hart <dvhart@linux.intel.com>
---
 include/uapi/linux/toshiba.h | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/toshiba.h b/include/uapi/linux/toshiba.h
index e9bef5b2f91e..c58bf4b5bb26 100644
--- a/include/uapi/linux/toshiba.h
+++ b/include/uapi/linux/toshiba.h
@@ -1,6 +1,7 @@
 /* toshiba.h -- Linux driver for accessing the SMM on Toshiba laptops 
  *
  * Copyright (c) 1996-2000  Jonathan A. Buzzard (jonathan@buzzard.org.uk)
+ * Copyright (c) 2015  Azael Avalos <coproscefalo@gmail.com>
  *
  * Thanks to Juergen Heinzl <juergen@monocerus.demon.co.uk> for the pointers
  * on making sure the structure is aligned and packed.
@@ -20,9 +21,18 @@
 #ifndef _UAPI_LINUX_TOSHIBA_H
 #define _UAPI_LINUX_TOSHIBA_H
 
-#define TOSH_PROC "/proc/toshiba"
-#define TOSH_DEVICE "/dev/toshiba"
-#define TOSH_SMM _IOWR('t', 0x90, int)	/* broken: meant 24 bytes */
+/*
+ * Toshiba modules paths
+ */
+
+#define TOSH_PROC		"/proc/toshiba"
+#define TOSH_DEVICE		"/dev/toshiba"
+#define TOSHIBA_ACPI_PROC	"/proc/acpi/toshiba"
+#define TOSHIBA_ACPI_DEVICE	"/dev/toshiba_acpi"
+
+/*
+ * Toshiba SMM structure
+ */
 
 typedef struct {
 	unsigned int eax;
@@ -33,5 +43,21 @@ typedef struct {
 	unsigned int edi __attribute__ ((packed));
 } SMMRegisters;
 
+/*
+ * IOCTLs (0x90 - 0x91)
+ */
+
+#define TOSH_SMM		_IOWR('t', 0x90, SMMRegisters)
+/*
+ * Convenience toshiba_acpi command.
+ *
+ * The System Configuration Interface (SCI) is opened/closed internally
+ * to avoid userspace of buggy BIOSes.
+ *
+ * The toshiba_acpi module checks whether the eax register is set with
+ * SCI_GET (0xf300) or SCI_SET (0xf400), returning -EINVAL if not.
+ */
+#define TOSHIBA_ACPI_SCI	_IOWR('t', 0x91, SMMRegisters)
+
 
 #endif /* _UAPI_LINUX_TOSHIBA_H */
-- 
cgit v1.2.3


From e0910bace663b78c026b73bbd711a24ccf410531 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 23 Jul 2015 15:43:56 +0200
Subject: lwtunnel: export linux/lwtunnel.h to userspace

Note also that include/linux/lwtunnel.h is not needed.

CC: Thomas Graf <tgraf@suug.ch>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
Fixes: 499a24256862 ("lwtunnel: infrastructure for handling light weight tunnels like mpls")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/Kbuild | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1ff9942718fe..aafb9937b162 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -243,6 +243,7 @@ header-y += limits.h
 header-y += llc.h
 header-y += loop.h
 header-y += lp.h
+header-y += lwtunnel.h
 header-y += magic.h
 header-y += major.h
 header-y += map_to_7segment.h
-- 
cgit v1.2.3


From ec92777f2ba93c00387b8fe53780c25adc57c744 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Thu, 9 Jul 2015 13:25:35 -0600
Subject: libnvdimm: Update name of the ars_status_record mask field

The spec suggests that this is a simple 'length' field, not a mask.
Update the name accordingly.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/uapi/linux/ndctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 2b94ea2287bb..e94bc20016b2 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -87,7 +87,7 @@ struct nd_cmd_ars_status {
 		__u32 handle;
 		__u32 flags;
 		__u64 err_address;
-		__u64 mask;
+		__u64 length;
 	} __packed records[0];
 } __packed;
 
-- 
cgit v1.2.3


From 39c686b862cdb2049b90e095b6c6c727b2a7ab60 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Thu, 9 Jul 2015 13:25:36 -0600
Subject: libnvdimm: Add DSM support for Address Range Scrub commands

Add support for the three ARS DSM commands:
- Query ARS Capabilities - Queries the firmware to check if a given
  range supports scrub, and if so, which type (persistent vs. volatile)
- Start ARS - Starts a scrub for a given range/type
- Query ARS Status - Checks status of a previously started scrub, and
  provides the error logs if any.

  The commands are described by the example DSM spec at:
  http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf

Also add these commands to the nfit_test test framework, and return
canned data.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/uapi/linux/ndctl.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index e94bc20016b2..5b4a4be06e2b 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -111,6 +111,11 @@ enum {
 	ND_CMD_VENDOR = 9,
 };
 
+enum {
+	ND_ARS_VOLATILE = 1,
+	ND_ARS_PERSISTENT = 2,
+};
+
 static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
 {
 	static const char * const names[] = {
@@ -194,4 +199,9 @@ enum nd_driver_flags {
 enum {
 	ND_MIN_NAMESPACE_SIZE = 0x00400000,
 };
+
+enum ars_masks {
+	ARS_STATUS_MASK = 0x0000FFFF,
+	ARS_EXT_STATUS_SHIFT = 16,
+};
 #endif /* __NDCTL_H__ */
-- 
cgit v1.2.3


From a37281b63681015b12c3b7322e6bd681c0ea1ef4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 21 Nov 2014 13:45:08 +0100
Subject: KVM: s390: more irq names for trace events

This patch adds names for missing irq types to the trace events.
In order to identify adapter irqs, the define is moved from
interrupt.c to the other basic irq defines in uapi/linux/kvm.h.

Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9ef19ebd9df4..0d831f94f8a8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -482,6 +482,7 @@ struct kvm_s390_psw {
 	 ((ai) << 26))
 #define KVM_S390_INT_IO_MIN		0x00000000u
 #define KVM_S390_INT_IO_MAX		0xfffdffffu
+#define KVM_S390_INT_IO_AI_MASK		0x04000000u
 
 
 struct kvm_s390_interrupt {
-- 
cgit v1.2.3


From 8486a0bba6fc13ae60a8e402cf77c60a7d4e7a04 Mon Sep 17 00:00:00 2001
From: Macpaul Lin <macpaul@gmail.com>
Date: Thu, 9 Jul 2015 15:18:38 +0800
Subject: usb: add usb_otg20_descriptor for OTG 2.0 and above

OTG 2.0 introduces bcdOTG in otg descriptor to identify the OTG and EH
supplement release number with which the OTG device is compliant, this
patch adds structure usb_otg20_descriptor for OTG 2.0 and above.

Signed-off-by: Macpaul Lin <macpaul@gmail.com>
Signed-off-by: Li Jun <jun.li@freescale.com>
Reviewed-by: Roger Quadros <rogerq@ti.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/uapi/linux/usb/ch9.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index aa33fd1b2d4f..aec689941c18 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -674,6 +674,17 @@ struct usb_otg_descriptor {
 	__u8  bmAttributes;	/* support for HNP, SRP, etc */
 } __attribute__ ((packed));
 
+/* USB_DT_OTG (from OTG 2.0 supplement) */
+struct usb_otg20_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+
+	__u8  bmAttributes;	/* support for HNP, SRP and ADP, etc */
+	__le16 bcdOTG;		/* OTG and EH supplement release number
+				 * in binary-coded decimal(i.e. 2.0 is 0200H)
+				 */
+} __attribute__ ((packed));
+
 /* from usb_otg_descriptor.bmAttributes */
 #define USB_OTG_SRP		(1 << 0)
 #define USB_OTG_HNP		(1 << 1)	/* swap host/device roles */
-- 
cgit v1.2.3


From 5d701cef9b40188c11c39be3a401bc4feeb154b8 Mon Sep 17 00:00:00 2001
From: Macpaul Lin <macpaul@gmail.com>
Date: Thu, 9 Jul 2015 15:18:39 +0800
Subject: usb: add USB_OTG_ADP definition

Add USB_OTG_ADP definition for usb_otg_descriptor.bmAttributes.

Signed-off-by: Macpaul Lin <macpaul@gmail.com>
Signed-off-by: Li Jun <jun.li@freescale.com>
Acked-by: Peter Chen <peter.chen@freescale.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/uapi/linux/usb/ch9.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index aec689941c18..f7adc6e01f9e 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -688,6 +688,7 @@ struct usb_otg20_descriptor {
 /* from usb_otg_descriptor.bmAttributes */
 #define USB_OTG_SRP		(1 << 0)
 #define USB_OTG_HNP		(1 << 1)	/* swap host/device roles */
+#define USB_OTG_ADP		(1 << 2)	/* support ADP */
 
 /*-------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3


From d7ee3519042798be6224e97f259ed47a63da4620 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Fri, 17 Jul 2015 16:17:56 +0200
Subject: netfilter: nf_ct_sctp: minimal multihoming support

Currently nf_conntrack_proto_sctp module handles only packets between
primary addresses used to establish the connection. Any packets between
secondary addresses are classified as invalid so that usual firewall
configurations drop them. Allowing HEARTBEAT and HEARTBEAT-ACK chunks to
establish a new conntrack would allow traffic between secondary
addresses to pass through. A more sophisticated solution based on the
addresses advertised in the initial handshake (and possibly also later
dynamic address addition and removal) would be much harder to implement.
Moreover, in general we cannot assume to always see the initial
handshake as it can be routed through a different path.

The patch adds two new conntrack states:

  SCTP_CONNTRACK_HEARTBEAT_SENT  - a HEARTBEAT chunk seen but not acked
  SCTP_CONNTRACK_HEARTBEAT_ACKED - a HEARTBEAT acked by HEARTBEAT-ACK

State transition rules:

- HEARTBEAT_SENT responds to usual chunks the same way as NONE (so that
  the behaviour changes as little as possible)
- HEARTBEAT_ACKED responds to usual chunks the same way as ESTABLISHED
  does, except the resulting state is HEARTBEAT_ACKED rather than
  ESTABLISHED
- previously existing states except NONE are preserved when HEARTBEAT or
  HEARTBEAT-ACK is seen
- NONE (in the initial direction) changes to HEARTBEAT_SENT on HEARTBEAT
  and to CLOSED on HEARTBEAT-ACK
- HEARTBEAT_SENT changes to HEARTBEAT_ACKED on HEARTBEAT-ACK in the
  reply direction
- HEARTBEAT_SENT and HEARTBEAT_ACKED are preserved on HEARTBEAT and
  HEARTBEAT-ACK otherwise

Normally, vtag is set from the INIT chunk for the reply direction and
from the INIT-ACK chunk for the originating direction (i.e. each of
these defines vtag value for the opposite direction). For secondary
conntracks, we can't rely on seeing INIT/INIT-ACK and even if we have
seen them, we would need to connect two different conntracks. Therefore
simplified logic is applied: vtag of first packet in each direction
(HEARTBEAT in the originating and HEARTBEAT-ACK in reply direction) is
saved and all following packets in that direction are compared with this
saved value. While INIT and INIT-ACK define vtag for the opposite
direction, vtags extracted from HEARTBEAT and HEARTBEAT-ACK are always
for their direction.

Default timeout values for new states are

  HEARTBEAT_SENT: 30 seconds (default hb_interval)
  HEARTBEAT_ACKED: 210 seconds (hb_interval * path_max_retry + max_rto)

(We cannot expect to see the shutdown sequence so that, unlike
ESTABLISHED, the HEARTBEAT_ACKED timeout shouldn't be too long.)

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_sctp.h   | 2 ++
 include/uapi/linux/netfilter/nfnetlink_cttimeout.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
index ceeefe6681b5..ed4e776e1242 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
@@ -13,6 +13,8 @@ enum sctp_conntrack {
 	SCTP_CONNTRACK_SHUTDOWN_SENT,
 	SCTP_CONNTRACK_SHUTDOWN_RECD,
 	SCTP_CONNTRACK_SHUTDOWN_ACK_SENT,
+	SCTP_CONNTRACK_HEARTBEAT_SENT,
+	SCTP_CONNTRACK_HEARTBEAT_ACKED,
 	SCTP_CONNTRACK_MAX
 };
 
diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
index 1ab0b97b3a1e..f2c10dc140d6 100644
--- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
+++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
@@ -92,6 +92,8 @@ enum ctattr_timeout_sctp {
 	CTA_TIMEOUT_SCTP_SHUTDOWN_SENT,
 	CTA_TIMEOUT_SCTP_SHUTDOWN_RECD,
 	CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
+	CTA_TIMEOUT_SCTP_HEARTBEAT_SENT,
+	CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED,
 	__CTA_TIMEOUT_SCTP_MAX
 };
 #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1)
-- 
cgit v1.2.3


From a0ddef81f4aeeeec3326f6b6a255d8ea13b41908 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Wed, 22 Jul 2015 14:30:14 -0400
Subject: tile: enable full SECCOMP support

Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
---
 include/uapi/linux/audit.h  | 3 +++
 include/uapi/linux/elf-em.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index d3475e1f15ec..1f977dd4c370 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -382,6 +382,9 @@ enum {
 #define AUDIT_ARCH_SHEL64	(EM_SH|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_SPARC	(EM_SPARC)
 #define AUDIT_ARCH_SPARC64	(EM_SPARCV9|__AUDIT_ARCH_64BIT)
+#define AUDIT_ARCH_TILEGX	(EM_TILEGX|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_TILEGX32	(EM_TILEGX|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_TILEPRO	(EM_TILEPRO|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 
 #define AUDIT_PERM_EXEC		1
diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index b08829667ed7..3429a3ba382b 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -38,6 +38,8 @@
 #define EM_ALTERA_NIOS2	113	/* Altera Nios II soft-core processor */
 #define EM_TI_C6000	140	/* TI C6X DSPs */
 #define EM_AARCH64	183	/* ARM 64 bit */
+#define EM_TILEPRO	188	/* Tilera TILEPro */
+#define EM_TILEGX	191	/* Tilera TILE-Gx */
 #define EM_FRV		0x5441	/* Fujitsu FR-V */
 #define EM_AVR32	0x18ad	/* Atmel AVR32 */
 
-- 
cgit v1.2.3


From 8013d1d7eafb0589ca766db6b74026f76b7f5cb4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 30 Jul 2015 14:28:42 +0800
Subject: net/ipv6: add sysctl option accept_ra_min_hop_limit

Commit 6fd99094de2b ("ipv6: Don't reduce hop limit for an interface")
disabled accept hop limit from RA if it is smaller than the current hop
limit for security stuff. But this behavior kind of break the RFC definition.

RFC 4861, 6.3.4.  Processing Received Router Advertisements
   A Router Advertisement field (e.g., Cur Hop Limit, Reachable Time,
   and Retrans Timer) may contain a value denoting that it is
   unspecified.  In such cases, the parameter should be ignored and the
   host should continue using whatever value it is already using.

   If the received Cur Hop Limit value is non-zero, the host SHOULD set
   its CurHopLimit variable to the received value.

So add sysctl option accept_ra_min_hop_limit to let user choose the minimum
hop limit value they can accept from RA. And set default to 1 to meet RFC
standards.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 641a146ead7d..80f3b74446a1 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -172,6 +172,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MTU,
 	DEVCONF_STABLE_SECRET,
 	DEVCONF_USE_OIF_ADDRS_ONLY,
+	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_MAX
 };
 
-- 
cgit v1.2.3


From d3aa45ce6b94c65b83971257317867db13e5f492 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 30 Jul 2015 15:36:57 -0700
Subject: bpf: add helpers to access tunnel metadata

Introduce helpers to let eBPF programs attached to TC manipulate tunnel metadata:
bpf_skb_[gs]et_tunnel_key(skb, key, size, flags)
skb: pointer to skb
key: pointer to 'struct bpf_tunnel_key'
size: size of 'struct bpf_tunnel_key'
flags: room for future extensions

First eBPF program that uses these helpers will allocate per_cpu
metadata_dst structures that will be used on TX.
On RX metadata_dst is allocated by tunnel driver.

Typical usage for TX:
struct bpf_tunnel_key tkey;
... populate tkey ...
bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);

RX:
struct bpf_tunnel_key tkey = {};
bpf_skb_get_tunnel_key(skb, &tkey, sizeof(tkey), 0);
... lookup or redirect based on tkey ...

'struct bpf_tunnel_key' will be extended in the future by adding
elements to the end and the 'size' argument will indicate which fields
are populated, thereby keeping backwards compatibility.
The 'flags' argument may be used as well when the 'size' is not enough or
to indicate completely different layout of bpf_tunnel_key.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2f6c83d714e9..bc0d27d3fbdd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -258,6 +258,18 @@ enum bpf_func_id {
 	BPF_FUNC_get_cgroup_classid,
 	BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */
 	BPF_FUNC_skb_vlan_pop,  /* bpf_skb_vlan_pop(skb) */
+
+	/**
+	 * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags)
+	 * retrieve or populate tunnel metadata
+	 * @skb: pointer to skb
+	 * @key: pointer to 'struct bpf_tunnel_key'
+	 * @size: size of 'struct bpf_tunnel_key'
+	 * @flags: room for future extensions
+	 * Retrun: 0 on success
+	 */
+	BPF_FUNC_skb_get_tunnel_key,
+	BPF_FUNC_skb_set_tunnel_key,
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -280,4 +292,9 @@ struct __sk_buff {
 	__u32 cb[5];
 };
 
+struct bpf_tunnel_key {
+	__u32 tunnel_id;
+	__u32 remote_ipv4;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From f8a9b1bc1b238eed9987da747a0e52f5bb009980 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 30 Jul 2015 20:10:22 -0700
Subject: vxlan: expose COLLECT_METADATA flag to user space

Two vxlan driver flags FLOWBASED and COLLECT_METADATA need to be set to
make use of its new flow mode. The former already exposed. Expose the latter.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9eeb5d9cf8f0..24e22cd4be79 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -383,6 +383,7 @@ enum {
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
 	IFLA_VXLAN_FLOWBASED,
+	IFLA_VXLAN_COLLECT_METADATA,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 0f7bffd9e512b77279bbce704fad3cb1d6887958 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 31 Jul 2015 16:49:43 +0200
Subject: bonding: add tlb_dynamic_lb netlink support

tlb_dynamic_lb could be set only via sysfs, this patch allows it to be
set via netlink.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 24e22cd4be79..ea047480a1f0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -434,6 +434,7 @@ enum {
 	IFLA_BOND_AD_ACTOR_SYS_PRIO,
 	IFLA_BOND_AD_USER_PORT_KEY,
 	IFLA_BOND_AD_ACTOR_SYSTEM,
+	IFLA_BOND_TLB_DYNAMIC_LB,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From ba7591d8b28bd16a2eface5d009ab0b60c7629a4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 1 Aug 2015 00:46:29 +0200
Subject: ebpf: add skb->hash to offset map for usage in {cls, act}_bpf or
 filters

Add skb->hash to the __sk_buff offset map, so it can be accessed from
an eBPF program. We currently already do this for classic BPF filters,
but not yet on eBPF, it might be useful as a demuxer in combination with
helpers like bpf_clone_redirect(), toy example:

  __section("cls-lb") int ingress_main(struct __sk_buff *skb)
  {
    unsigned int which = 3 + (skb->hash & 7);
    /* bpf_skb_store_bytes(skb, ...); */
    /* bpf_l{3,4}_csum_replace(skb, ...); */
    bpf_clone_redirect(skb, which, 0);
    return -1;
  }

I was thinking whether to add skb_get_hash(), but then concluded the
raw skb->hash seems fine in this case: we can directly access the hash
w/o extra eBPF helper function call, it's filled out by many NICs on
ingress, and in case the entropy level would not be sufficient, people
can still implement their own specific sw fallback hash mix anyway.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bc0d27d3fbdd..2ce13c109b00 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -290,6 +290,7 @@ struct __sk_buff {
 	__u32 ifindex;
 	__u32 tc_index;
 	__u32 cb[5];
+	__u32 hash;
 };
 
 struct bpf_tunnel_key {
-- 
cgit v1.2.3


From 3c7c8468e5d993dfe377a67e41cbb23cda93af9e Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Sun, 26 Jul 2015 09:54:20 +0300
Subject: mei: add async event notification ioctls

Add ioctl IOCTL_MEI_NOTIFY_SET for enabling and disabling
async event notification.
Add ioctl IOCTL_MEI_NOTIFY_GET for receiving and acking
an event notification.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/mei.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/mei.h b/include/uapi/linux/mei.h
index bc0d8b69c49e..7c3b64f6a215 100644
--- a/include/uapi/linux/mei.h
+++ b/include/uapi/linux/mei.h
@@ -107,4 +107,23 @@ struct mei_connect_client_data {
 	};
 };
 
+/**
+ * DOC: set and unset event notification for a connected client
+ *
+ * The IOCTL argument is 1 for enabling event notification and 0 for
+ * disabling the service
+ * Return:  -EOPNOTSUPP if the devices doesn't support the feature
+ */
+#define IOCTL_MEI_NOTIFY_SET _IOW('H', 0x02, __u32)
+
+/**
+ * DOC: retrieve notification
+ *
+ * The IOCTL output argument is 1 if an event was is pending and 0 otherwise
+ * the ioctl has to be called in order to acknowledge pending event
+ *
+ * Return:  -EOPNOTSUPP if the devices doesn't support the feature
+ */
+#define IOCTL_MEI_NOTIFY_GET _IOR('H', 0x03, __u32)
+
 #endif /* _LINUX_MEI_H  */
-- 
cgit v1.2.3


From a6affd24f439feddec04bab4d1e3ad6579868367 Mon Sep 17 00:00:00 2001
From: Robert Shearman <rshearma@brocade.com>
Date: Mon, 3 Aug 2015 17:50:04 +0100
Subject: mpls: Use definition for reserved label checks

In multiple locations there are checks for whether the label in hand
is a reserved label or not using the arbritray value of 16. Factor
this out into a #define for better maintainability and for
documentation.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mpls.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h
index 139d4dd1cab8..24a6cb1aec86 100644
--- a/include/uapi/linux/mpls.h
+++ b/include/uapi/linux/mpls.h
@@ -41,4 +41,6 @@ struct mpls_label {
 #define MPLS_LABEL_OAMALERT		14 /* RFC3429 */
 #define MPLS_LABEL_EXTENSION		15 /* RFC7274 */
 
+#define MPLS_LABEL_FIRST_UNRESERVED	16 /* RFC3032 */
+
 #endif /* _UAPI_MPLS_H */
-- 
cgit v1.2.3


From 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 10 May 2015 12:22:42 -0700
Subject: perf: Add cycles to branch_info

Intel Skylake supports reporting the time in cycles a branch in the LBR
took, to give a rough indication of the basic block performance.

Export the cycle information in the branch_info structure.
This can be done by just reusing some currently zero padding.

This is just the generic header change. The architecture
still needs to fill it in.

There's no attempt to convert to real time, as we really
want cycles here.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-5-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 022d0acf7df0..2881145cda86 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -951,6 +951,7 @@ union perf_mem_data_src {
  *
  *     in_tx: running in a hardware transaction
  *     abort: aborting a hardware transaction
+ *    cycles: cycles from last branch (or 0 if not supported)
  */
 struct perf_branch_entry {
 	__u64	from;
@@ -959,7 +960,8 @@ struct perf_branch_entry {
 		predicted:1,/* target predicted */
 		in_tx:1,    /* in transaction */
 		abort:1,    /* transaction abort */
-		reserved:60;
+		cycles:16,  /* cycle count to last branch */
+		reserved:44;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
-- 
cgit v1.2.3


From 35c051258e8fd7cb97222f4aa887bcd404c156d0 Mon Sep 17 00:00:00 2001
From: Sinclair Yeh <syeh@vmware.com>
Date: Fri, 26 Jun 2015 01:42:06 -0700
Subject: drm/vmwgfx: Implement screen targets

Add support for the screen target device interface.
Add a getparam parameter and bump minor to signal availability.

Signed-off-by: Sinclair Yeh <syeh@vmware.com>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 include/uapi/drm/vmwgfx_drm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index c472bedbe38e..c8a863180174 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -88,6 +88,7 @@
 #define DRM_VMW_PARAM_3D_CAPS_SIZE     8
 #define DRM_VMW_PARAM_MAX_MOB_MEMORY   9
 #define DRM_VMW_PARAM_MAX_MOB_SIZE     10
+#define DRM_VMW_PARAM_SCREEN_TARGET    11
 
 /**
  * enum drm_vmw_handle_type - handle type for ref ioctls
-- 
cgit v1.2.3


From 34d99af52ad40bd498ba66970579a5bc1fb1a3bc Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 5 Aug 2015 16:29:37 -0400
Subject: audit: implement audit by executable

This adds the ability audit the actions of a not-yet-running process.

This patch implements the ability to filter on the executable path.  Instead of
just hard coding the ino and dev of the executable we care about at the moment
the rule is inserted into the kernel, use the new audit_fsnotify
infrastructure to manage this dynamically.  This means that if the filename
does not yet exist but the containing directory does, or if the inode in
question is unlinked and creat'd (aka updated) the rule will just continue to
work.  If the containing directory is moved or deleted or the filesystem is
unmounted, the rule is deleted automatically.  A future enhancement would be to
have the rule survive across directory disruptions.

This is a heavily modified version of a patch originally submitted by Eric
Paris with some ideas from Peter Moody.

Cc: Peter Moody <peter@hda3.com>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: minor whitespace clean to satisfy ./scripts/checkpatch]
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/uapi/linux/audit.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index d3475e1f15ec..f6ff62c24aba 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -266,6 +266,7 @@
 #define AUDIT_OBJ_UID	109
 #define AUDIT_OBJ_GID	110
 #define AUDIT_FIELD_COMPARE	111
+#define AUDIT_EXE	112
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
@@ -324,8 +325,10 @@ enum {
 
 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT	0x00000001
 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME	0x00000002
+#define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH	0x00000004
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
-				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME)
+				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
+				  AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST 		AUDIT_FEATURE_BITMAP_ALL
-- 
cgit v1.2.3


From d877f07112f1e5a247c6b585c971a93895c9f738 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 31 May 2015 18:04:11 +0200
Subject: netfilter: nf_tables: add nft_dup expression

This new expression uses the nf_dup engine to clone packets to a given gateway.
Unlike xt_TEE, we use an index to indicate output interface which should be
fine at this stage.

Moreover, change to the preemtion-safe this_cpu_read(nf_skb_duplicated) from
nf_dup_ipv{4,6} to silence a lockdep splat.

Based on the original tee expression from Arturo Borrero Gonzalez, although
this patch has diverted quite a bit from this initial effort due to the
change to support maps.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a99e6a997140..2ef35f2c9bda 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -935,6 +935,20 @@ enum nft_redir_attributes {
 };
 #define NFTA_REDIR_MAX		(__NFTA_REDIR_MAX - 1)
 
+/**
+ * enum nft_dup_attributes - nf_tables dup expression netlink attributes
+ *
+ * @NFTA_DUP_SREG_ADDR: source register of address (NLA_U32: nft_registers)
+ * @NFTA_DUP_SREG_DEV: source register of output interface (NLA_U32: nft_register)
+ */
+enum nft_dup_attributes {
+	NFTA_DUP_UNSPEC,
+	NFTA_DUP_SREG_ADDR,
+	NFTA_DUP_SREG_DEV,
+	__NFTA_DUP_MAX
+};
+#define NFTA_DUP_MAX		(__NFTA_DUP_MAX - 1)
+
 /**
  * enum nft_gen_attributes - nf_tables ruleset generation attributes
  *
-- 
cgit v1.2.3


From 3e87baafa4f476017b2453e52a1ca7308b4e6ad5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 2 Aug 2015 18:02:14 +0200
Subject: netfilter: nft_limit: add burst parameter

This patch adds the burst parameter. This burst indicates the number of packets
that can exceed the limit.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 2ef35f2c9bda..cafd78943c7d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -761,11 +761,13 @@ enum nft_ct_attributes {
  *
  * @NFTA_LIMIT_RATE: refill rate (NLA_U64)
  * @NFTA_LIMIT_UNIT: refill unit (NLA_U64)
+ * @NFTA_LIMIT_BURST: burst (NLA_U32)
  */
 enum nft_limit_attributes {
 	NFTA_LIMIT_UNSPEC,
 	NFTA_LIMIT_RATE,
 	NFTA_LIMIT_UNIT,
+	NFTA_LIMIT_BURST,
 	__NFTA_LIMIT_MAX
 };
 #define NFTA_LIMIT_MAX		(__NFTA_LIMIT_MAX - 1)
-- 
cgit v1.2.3


From d2168e849ebf617b2b7feae44c0c0baf739cb610 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 5 Aug 2015 12:38:44 +0200
Subject: netfilter: nft_limit: add per-byte limiting

This patch adds a new NFTA_LIMIT_TYPE netlink attribute to indicate the type of
limiting.

Contrary to per-packet limiting, the cost is calculated from the packet path
since this depends on the packet length.

The burst attribute indicates the number of bytes in which the rate can be
exceeded.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index cafd78943c7d..d8c8a7c9d88a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -756,18 +756,25 @@ enum nft_ct_attributes {
 };
 #define NFTA_CT_MAX		(__NFTA_CT_MAX - 1)
 
+enum nft_limit_type {
+	NFT_LIMIT_PKTS,
+	NFT_LIMIT_PKT_BYTES
+};
+
 /**
  * enum nft_limit_attributes - nf_tables limit expression netlink attributes
  *
  * @NFTA_LIMIT_RATE: refill rate (NLA_U64)
  * @NFTA_LIMIT_UNIT: refill unit (NLA_U64)
  * @NFTA_LIMIT_BURST: burst (NLA_U32)
+ * @NFTA_LIMIT_TYPE: type of limit (NLA_U32: enum nft_limit_type)
  */
 enum nft_limit_attributes {
 	NFTA_LIMIT_UNSPEC,
 	NFTA_LIMIT_RATE,
 	NFTA_LIMIT_UNIT,
 	NFTA_LIMIT_BURST,
+	NFTA_LIMIT_TYPE,
 	__NFTA_LIMIT_MAX
 };
 #define NFTA_LIMIT_MAX		(__NFTA_LIMIT_MAX - 1)
-- 
cgit v1.2.3


From da8b43c0e1dcea3bcac5f37ea59934ddaa137aed Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 4 Aug 2015 22:51:07 -0700
Subject: vxlan: combine VXLAN_FLOWBASED into VXLAN_COLLECT_METADATA

IFLA_VXLAN_FLOWBASED is useless without IFLA_VXLAN_COLLECT_METADATA,
so combine them into single IFLA_VXLAN_COLLECT_METADATA flag.
'flowbased' doesn't convey real meaning of the vxlan tunnel mode.
This mode can be used by routing, tc+bpf and ovs.
Only ovs is strictly flow based, so 'collect metadata' is a better
name for this tunnel mode.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ea047480a1f0..f24ec99a2262 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -382,7 +382,6 @@ enum {
 	IFLA_VXLAN_REMCSUM_RX,
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
-	IFLA_VXLAN_FLOWBASED,
 	IFLA_VXLAN_COLLECT_METADATA,
 	__IFLA_VXLAN_MAX
 };
-- 
cgit v1.2.3


From ea317b267e9d03a8241893aa176fba7661d07579 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:34 +0000
Subject: bpf: Add new bpf map type to store the pointer to struct perf_event

Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ce13c109b00..a1814e8e53a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
-- 
cgit v1.2.3


From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:35 +0000
Subject: bpf: Implement function bpf_perf_event_read() that get the selected
 hardware PMU conuter

According to the perf_event_map_fd and index, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the
Hardware PMU counter value.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a1814e8e53a7..92a48e2d5461 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -271,6 +271,7 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_get_tunnel_key,
 	BPF_FUNC_skb_set_tunnel_key,
+	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From a7854037da006a7472c48773e3190db55217ec9b Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 7 Aug 2015 19:40:45 +0300
Subject: bridge: netlink: add support for vlan_filtering attribute

This patch adds the ability to toggle the vlan filtering support via
netlink. Since we're already running with rtnl in .changelink() we don't
need to take any additional locks.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f24ec99a2262..d450be36add2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -230,6 +230,7 @@ enum {
 	IFLA_BR_AGEING_TIME,
 	IFLA_BR_STP_STATE,
 	IFLA_BR_PRIORITY,
+	IFLA_BR_VLAN_FILTERING,
 	__IFLA_BR_MAX,
 };
 
-- 
cgit v1.2.3


From fb811395cd5a71b9e94a068f524a6f4a21b67bdb Mon Sep 17 00:00:00 2001
From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 7 Aug 2015 11:10:37 -0700
Subject: net: add explicit logging and stat for neighbour table overflow

Add an explicit neighbour table overflow message (ratelimited) and
statistic to make diagnosing neighbour table overflows tractable in
the wild.

Diagnosing a neighbour table overflow can be quite difficult in the wild
because there is no explicit dmesg logged.  Callers to neighbour code
seem to use net_dbg_ratelimit when the neighbour call fails which means
the "base message" is not emitted and the callback suppressed messages
from the ratelimiting can end-up juxtaposed with unrelated messages.
Further, a forced garbage collection will increment a stat on each call
whether it was successful in freeing-up a table entry or not, so that
statistic is only a hint.  So, add a net_info_ratelimited message and
explicit statistic to the neighbour code.

Signed-off-by: Rick Jones <rick.jones2@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/neighbour.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 2e35c61bbdd1..788655bfa0f3 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -106,6 +106,7 @@ struct ndt_stats {
 	__u64		ndts_rcv_probes_ucast;
 	__u64		ndts_periodic_gc_runs;
 	__u64		ndts_forced_gc_runs;
+	__u64		ndts_table_fulls;
 };
 
 enum {
-- 
cgit v1.2.3


From 2e15ea390e6f4466655066d97e22ec66870a042c Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Fri, 7 Aug 2015 23:51:42 -0700
Subject: ip_gre: Add support to collect tunnel metadata.

Following patch create new tunnel flag which enable
tunnel metadata collection on given device.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_tunnel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index bd3cc11a431f..af4de90ba27d 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -112,6 +112,7 @@ enum {
 	IFLA_GRE_ENCAP_FLAGS,
 	IFLA_GRE_ENCAP_SPORT,
 	IFLA_GRE_ENCAP_DPORT,
+	IFLA_GRE_COLLECT_METADATA,
 	__IFLA_GRE_MAX,
 };
 
-- 
cgit v1.2.3


From d80efd5cb3dec16a8d1aea9b8a4a7921972dba65 Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Mon, 10 Aug 2015 10:39:35 -0700
Subject: drm/vmwgfx: Initial DX support

Initial DX support.
Co-authored with Sinclair Yeh, Charmaine Lee and Jakob Bornecrantz.

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Signed-off-by: Sinclair Yeh <syeh@vmware.com>
Signed-off-by: Charmaine Lee <charmainel@vmware.com>
---
 include/uapi/drm/vmwgfx_drm.h | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index c8a863180174..c5bcddd9f58c 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -64,6 +64,7 @@
 #define DRM_VMW_GB_SURFACE_CREATE    23
 #define DRM_VMW_GB_SURFACE_REF       24
 #define DRM_VMW_SYNCCPU              25
+#define DRM_VMW_CREATE_EXTENDED_CONTEXT 26
 
 /*************************************************************************/
 /**
@@ -89,6 +90,7 @@
 #define DRM_VMW_PARAM_MAX_MOB_MEMORY   9
 #define DRM_VMW_PARAM_MAX_MOB_SIZE     10
 #define DRM_VMW_PARAM_SCREEN_TARGET    11
+#define DRM_VMW_PARAM_DX               12
 
 /**
  * enum drm_vmw_handle_type - handle type for ref ioctls
@@ -297,7 +299,7 @@ union drm_vmw_surface_reference_arg {
  * Argument to the DRM_VMW_EXECBUF Ioctl.
  */
 
-#define DRM_VMW_EXECBUF_VERSION 1
+#define DRM_VMW_EXECBUF_VERSION 2
 
 struct drm_vmw_execbuf_arg {
 	uint64_t commands;
@@ -306,6 +308,8 @@ struct drm_vmw_execbuf_arg {
 	uint64_t fence_rep;
 	uint32_t version;
 	uint32_t flags;
+	uint32_t context_handle;
+	uint32_t pad64;
 };
 
 /**
@@ -826,7 +830,6 @@ struct drm_vmw_update_layout_arg {
 enum drm_vmw_shader_type {
 	drm_vmw_shader_type_vs = 0,
 	drm_vmw_shader_type_ps,
-	drm_vmw_shader_type_gs
 };
 
 
@@ -908,6 +911,8 @@ enum drm_vmw_surface_flags {
  * @buffer_handle     Buffer handle of backup buffer. SVGA3D_INVALID_ID
  *                    if none.
  * @base_size         Size of the base mip level for all faces.
+ * @array_size        Must be zero for non-DX hardware, and if non-zero
+ *                    svga3d_flags must have proper bind flags setup.
  *
  * Input argument to the  DRM_VMW_GB_SURFACE_CREATE Ioctl.
  * Part of output argument for the DRM_VMW_GB_SURFACE_REF Ioctl.
@@ -920,7 +925,7 @@ struct drm_vmw_gb_surface_create_req {
 	uint32_t multisample_count;
 	uint32_t autogen_filter;
 	uint32_t buffer_handle;
-	uint32_t pad64;
+	uint32_t array_size;
 	struct drm_vmw_size base_size;
 };
 
@@ -1060,4 +1065,28 @@ struct drm_vmw_synccpu_arg {
 	uint32_t pad64;
 };
 
+/*************************************************************************/
+/**
+ * DRM_VMW_CREATE_EXTENDED_CONTEXT - Create a host context.
+ *
+ * Allocates a device unique context id, and queues a create context command
+ * for the host. Does not wait for host completion.
+ */
+enum drm_vmw_extended_context {
+	drm_vmw_context_legacy,
+	drm_vmw_context_dx
+};
+
+/**
+ * union drm_vmw_extended_context_arg
+ *
+ * @req: Context type.
+ * @rep: Context identifier.
+ *
+ * Argument to the DRM_VMW_CREATE_EXTENDED_CONTEXT Ioctl.
+ */
+union drm_vmw_extended_context_arg {
+	enum drm_vmw_extended_context req;
+	struct drm_vmw_context_arg rep;
+};
 #endif
-- 
cgit v1.2.3


From 54fbde8a94a8a78547597215c9e4be590d075ee0 Mon Sep 17 00:00:00 2001
From: Sinclair Yeh <syeh@vmware.com>
Date: Wed, 29 Jul 2015 12:38:02 -0700
Subject: drm/vmwgfx: Fix copyright headers

Updating and fixing copyright headers.
Bump version minor to signal vgpu10 support.

Signed-off-by: Sinclair Yeh <syeh@vmware.com>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 include/uapi/drm/vmwgfx_drm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index c5bcddd9f58c..05b204954d16 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA
+ * Copyright © 2009-2015 VMware, Inc., Palo Alto, CA., USA
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
-- 
cgit v1.2.3


From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 10:39:01 -0400
Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down

Like the ipv4 patch with a similar title, this adds a sysctl to allow
the user to change routing behavior based on whether or not the
interface associated with the nexthop was an up or down link.  The
default setting preserves the current behavior, but anyone that enables
it will notice that nexthops on down interfaces will no longer be
selected:

net.ipv6.conf.all.ignore_routes_with_linkdown = 0
net.ipv6.conf.default.ignore_routes_with_linkdown = 0
net.ipv6.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, not only will link status be reported to
userspace, but an indication that a nexthop is dead and will not be used
is also reported.

1000::/8 via 7000::2 dev p7p1  metric 1024 dead linkdown  pref medium
1000::/8 via 8000::2 dev p8p1  metric 1024  pref medium
7000::/8 dev p7p1  proto kernel  metric 256 dead linkdown  pref medium
8000::/8 dev p8p1  proto kernel  metric 256  pref medium
9000::/8 via 8000::2 dev p8p1  metric 2048  pref medium
9000::/8 via 7000::2 dev p7p1  metric 1024 dead linkdown  pref medium
fe80::/64 dev p7p1  proto kernel  metric 256 dead linkdown  pref medium
fe80::/64 dev p8p1  proto kernel  metric 256  pref medium

This also adds devconf support and notification when sysctl values
change.

v2: drop use of rt6i_nhflags since it is not needed right now

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 80f3b74446a1..38b4fef20219 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -173,6 +173,7 @@ enum {
 	DEVCONF_STABLE_SECRET,
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
+	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_MAX
 };
 
-- 
cgit v1.2.3


From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:00 -0600
Subject: net: Introduce VRF related flags and helpers

Add a VRF_MASTER flag for interfaces and helper functions for determining
if a device is a VRF_MASTER.

Add link attribute for passing VRF_TABLE id.

Add vrf_ptr to netdevice.

Add various macros for determining if a device is a VRF device, the index
of the master VRF device and table associated with VRF device.

Signed-off-by: Shrijeet Mukherjee <shm@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d450be36add2..313c305fd1ad 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -341,6 +341,15 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VRF section */
+enum {
+	IFLA_VRF_UNSPEC,
+	IFLA_VRF_TABLE,
+	__IFLA_VRF_MAX
+};
+
+#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3


From a1c234f95cae2d293047bb6c36e7a4840dbac815 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 14 Aug 2015 16:40:40 +0200
Subject: lwtunnel: rename ip lwtunnel attributes

We already have IFLA_IPTUN_ netlink attributes. The IP_TUN_ attributes look
very similar, yet they serve very different purpose. This is confusing for
anyone trying to implement a user space tool supporting lwt.

As the IP_TUN_ attributes are used only for the lightweight tunnels, prefix
them with LWTUNNEL_IP_ instead to make their purpose clear. Also, it's more
logical to have them in lwtunnel.h together with the encap enum.

Fixes: 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h  | 14 ++++++++++++++
 include/uapi/linux/rtnetlink.h | 15 ---------------
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..3bf223bc2367 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -12,5 +12,19 @@ enum lwtunnel_encap_types {
 
 #define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1)
 
+enum lwtunnel_ip_t {
+	LWTUNNEL_IP_UNSPEC,
+	LWTUNNEL_IP_ID,
+	LWTUNNEL_IP_DST,
+	LWTUNNEL_IP_SRC,
+	LWTUNNEL_IP_TTL,
+	LWTUNNEL_IP_TOS,
+	LWTUNNEL_IP_SPORT,
+	LWTUNNEL_IP_DPORT,
+	LWTUNNEL_IP_FLAGS,
+	__LWTUNNEL_IP_MAX,
+};
+
+#define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1)
 
 #endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 47d24cb3fbc1..0d3d3cc43356 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -286,21 +286,6 @@ enum rt_class_t {
 
 /* Routing message attributes */
 
-enum ip_tunnel_t {
-	IP_TUN_UNSPEC,
-	IP_TUN_ID,
-	IP_TUN_DST,
-	IP_TUN_SRC,
-	IP_TUN_TTL,
-	IP_TUN_TOS,
-	IP_TUN_SPORT,
-	IP_TUN_DPORT,
-	IP_TUN_FLAGS,
-	__IP_TUN_MAX,
-};
-
-#define IP_TUN_MAX (__IP_TUN_MAX - 1)
-
 enum rtattr_type_t {
 	RTA_UNSPEC,
 	RTA_DST,
-- 
cgit v1.2.3


From 47dceb8ecdc1c3ad1818dfea3d659a05b74c3fc2 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 14 Aug 2015 22:31:34 -0400
Subject: packet: add classic BPF fanout mode

Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program
to select a socket.

This avoids having to keep adding special case fanout modes. One
example use case is application layer load balancing. The QUIC
protocol, for instance, encodes a connection ID in UDP payload.

Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data
associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the
only user so far.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index d3d715f8c88f..a4bb16fa822e 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -55,6 +55,7 @@ struct sockaddr_ll {
 #define PACKET_TX_HAS_OFF		19
 #define PACKET_QDISC_BYPASS		20
 #define PACKET_ROLLOVER_STATS		21
+#define PACKET_FANOUT_DATA		22
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
@@ -62,6 +63,7 @@ struct sockaddr_ll {
 #define PACKET_FANOUT_ROLLOVER		3
 #define PACKET_FANOUT_RND		4
 #define PACKET_FANOUT_QM		5
+#define PACKET_FANOUT_CBPF		6
 #define PACKET_FANOUT_FLAG_ROLLOVER	0x1000
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
-- 
cgit v1.2.3


From f2e520956a1ab636698f8160194c9b8ac0989aab Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 14 Aug 2015 22:31:35 -0400
Subject: packet: add extended BPF fanout mode

Add fanout mode PACKET_FANOUT_EBPF that accepts an en extended BPF
program to select a socket.

Update the internal eBPF program by passing to socket option
SOL_PACKET/PACKET_FANOUT_DATA a file descriptor returned by bpf().

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index a4bb16fa822e..9e7edfd8141e 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -64,6 +64,7 @@ struct sockaddr_ll {
 #define PACKET_FANOUT_RND		4
 #define PACKET_FANOUT_QM		5
 #define PACKET_FANOUT_CBPF		6
+#define PACKET_FANOUT_EBPF		7
 #define PACKET_FANOUT_FLAG_ROLLOVER	0x1000
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
-- 
cgit v1.2.3


From deedb59039f111c41aa5a54ee384c8e7c08bc78a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 14 Aug 2015 16:03:39 +0200
Subject: netfilter: nf_conntrack: add direction support for zones

This work adds a direction parameter to netfilter zones, so identity
separation can be performed only in original/reply or both directions
(default). This basically opens up the possibility of doing NAT with
conflicting IP address/port tuples from multiple, isolated tenants
on a host (e.g. from a netns) without requiring each tenant to NAT
twice resp. to use its own dedicated IP address to SNAT to, meaning
overlapping tuples can be made unique with the zone identifier in
original direction, where the NAT engine will then allocate a unique
tuple in the commonly shared default zone for the reply direction.
In some restricted, local DNAT cases, also port redirection could be
used for making the reply traffic unique w/o requiring SNAT.

The consensus we've reached and discussed at NFWS and since the initial
implementation [1] was to directly integrate the direction meta data
into the existing zones infrastructure, as opposed to the ct->mark
approach we proposed initially.

As we pass the nf_conntrack_zone object directly around, we don't have
to touch all call-sites, but only those, that contain equality checks
of zones. Thus, based on the current direction (original or reply),
we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID.
CT expectations are direction-agnostic entities when expectations are
being compared among themselves, so we can only use the identifier
in this case.

Note that zone identifiers can not be included into the hash mix
anymore as they don't contain a "stable" value that would be equal
for both directions at all times, f.e. if only zone->id would
unconditionally be xor'ed into the table slot hash, then replies won't
find the corresponding conntracking entry anymore.

If no particular direction is specified when configuring zones, the
behaviour is exactly as we expect currently (both directions).

Support has been added for the CT netlink interface as well as the
x_tables raw CT target, which both already offer existing interfaces
to user space for the configuration of zones.

Below a minimal, simplified collision example (script in [2]) with
netperf sessions:

  +--- tenant-1 ---+   mark := 1
  |    netperf     |--+
  +----------------+  |                CT zone := mark [ORIGINAL]
   [ip,sport] := X   +--------------+  +--- gateway ---+
                     | mark routing |--|     SNAT      |-- ... +
                     +--------------+  +---------------+       |
  +--- tenant-2 ---+  |                                     ~~~|~~~
  |    netperf     |--+                +-----------+           |
  +----------------+   mark := 2       | netserver |------ ... +
   [ip,sport] := X                     +-----------+
                                        [ip,port] := Y
On the gateway netns, example:

  iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL
  iptables -t nat -A POSTROUTING -o <dev> -j SNAT --to-source <ip> --random-fully

  iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark
  iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark

conntrack dump from gateway netns:

  netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns

  tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1
                           src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024
               [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1

  tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2
                           src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555
               [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1

  tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1
                        src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438
               [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1

  tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2
                        src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889
               [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2

Taking this further, test script in [2] creates 200 tenants and runs
original-tuple colliding netperf sessions each. A conntrack -L dump in
the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED
state as expected.

I also did run various other tests with some permutations of the script,
to mention some: SNAT in random/random-fully/persistent mode, no zones (no
overlaps), static zones (original, reply, both directions), etc.

  [1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/
  [2] https://paste.fedoraproject.org/242835/65657871/

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 +
 include/uapi/linux/netfilter/xt_CT.h               | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index acad6c52a652..c1a4e1441a25 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -61,6 +61,7 @@ enum ctattr_tuple {
 	CTA_TUPLE_UNSPEC,
 	CTA_TUPLE_IP,
 	CTA_TUPLE_PROTO,
+	CTA_TUPLE_ZONE,
 	__CTA_TUPLE_MAX
 };
 #define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1)
diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h
index 5a688c1ca4d7..452005ff0e9e 100644
--- a/include/uapi/linux/netfilter/xt_CT.h
+++ b/include/uapi/linux/netfilter/xt_CT.h
@@ -6,7 +6,11 @@
 enum {
 	XT_CT_NOTRACK		= 1 << 0,
 	XT_CT_NOTRACK_ALIAS	= 1 << 1,
-	XT_CT_MASK		= XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS,
+	XT_CT_ZONE_DIR_ORIG	= 1 << 2,
+	XT_CT_ZONE_DIR_REPL	= 1 << 3,
+
+	XT_CT_MASK		= XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS |
+				  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL,
 };
 
 struct xt_ct_target_info {
-- 
cgit v1.2.3


From 5e8018fc61423e677398d4ad4d72df70b9788e77 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 14 Aug 2015 16:03:40 +0200
Subject: netfilter: nf_conntrack: add efficient mark to zone mapping

This work adds the possibility of deriving the zone id from the skb->mark
field in a scalable manner. This allows for having only a single template
serving hundreds/thousands of different zones, for example, instead of the
need to have one match for each zone as an extra CT jump target.

Note that we'd need to have this information attached to the template as at
the time when we're trying to lookup a possible ct object, we already need
to know zone information for a possible match when going into
__nf_conntrack_find_get(). This work provides a minimal implementation for
a possible mapping.

In order to not add/expose an extra ct->status bit, the zone structure has
been extended to carry a flag for deriving the mark.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_CT.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h
index 452005ff0e9e..9e520418b858 100644
--- a/include/uapi/linux/netfilter/xt_CT.h
+++ b/include/uapi/linux/netfilter/xt_CT.h
@@ -8,9 +8,11 @@ enum {
 	XT_CT_NOTRACK_ALIAS	= 1 << 1,
 	XT_CT_ZONE_DIR_ORIG	= 1 << 2,
 	XT_CT_ZONE_DIR_REPL	= 1 << 3,
+	XT_CT_ZONE_MARK		= 1 << 4,
 
 	XT_CT_MASK		= XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS |
-				  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL,
+				  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL |
+				  XT_CT_ZONE_MARK,
 };
 
 struct xt_ct_target_info {
-- 
cgit v1.2.3


From 65d7ab8de582bc668e3dabb6ff48f750098a6e78 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 17 Aug 2015 13:42:27 -0700
Subject: net: Identifier Locator Addressing module

Adding new module name ila. This implements ILA translation. Light
weight tunnel redirection is used to perform the translation in
the data path. This is configured by the "ip -6 route" command
using the "encap ila <locator>" option, where <locator> is the
value to set in destination locator of the packet. e.g.

ip -6 route add 3333:0:0:1:5555:0:1:0/128 \
      encap ila 2001:0:0:1 via 2401:db00:20:911a:face:0:25:0

Sets a route where 3333:0:0:1 will be overwritten by
2001:0:0:1 on output.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h      | 15 +++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 2 files changed, 16 insertions(+)
 create mode 100644 include/uapi/linux/ila.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
new file mode 100644
index 000000000000..7ed9e670814e
--- /dev/null
+++ b/include/uapi/linux/ila.h
@@ -0,0 +1,15 @@
+/* ila.h - ILA Interface */
+
+#ifndef _UAPI_LINUX_ILA_H
+#define _UAPI_LINUX_ILA_H
+
+enum {
+	ILA_ATTR_UNSPEC,
+	ILA_ATTR_LOCATOR,			/* u64 */
+
+	__ILA_ATTR_MAX,
+};
+
+#define ILA_ATTR_MAX		(__ILA_ATTR_MAX - 1)
+
+#endif /* _UAPI_LINUX_ILA_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 3bf223bc2367..aa84ca396bcb 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
+	LWTUNNEL_ENCAP_ILA,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
-- 
cgit v1.2.3


From d9232a3da8683cd9c9854a858bcca968fe5f3bca Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Thu, 23 Jul 2015 16:43:56 +1000
Subject: cxl: Add alternate MMIO error handling

userspace programs using cxl currently have to use two strategies for
dealing with MMIO errors simultaneously. They have to check every read
for a return of all Fs in case the adapter has gone away and the kernel
has not yet noticed, and they have to deal with SIGBUS in case the
kernel has already noticed, invalidated the mapping and marked the
context as failed.

In order to simplify things, this patch adds an alternative approach
where the kernel will return a page filled with Fs instead of delivering
a SIGBUS. This allows userspace to only need to deal with one of these
two error paths, and is intended for use in libraries that use cxl
transparently and may not be able to safely install a signal handler.

This approach will only work if certain constraints are met. Namely, if
the application is both reading and writing to an address in the problem
state area it cannot assume that a non-FF read is OK, as it may just be
reading out a value it has previously written. Further - since only one
page is used per context a write to a given offset would be visible when
reading the same offset from a different page in the mapping (this only
applies within a single context, not between contexts).

An application could deal with this by e.g. making sure it also reads
from a read-only offset after any reads to a read/write offset.

Due to these constraints, this functionality must be explicitly
requested by userspace when starting the context by passing in the
CXL_START_WORK_ERR_FF flag.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/uapi/misc/cxl.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h
index 99a8ca15fe64..1e889aa8a36e 100644
--- a/include/uapi/misc/cxl.h
+++ b/include/uapi/misc/cxl.h
@@ -29,8 +29,10 @@ struct cxl_ioctl_start_work {
 
 #define CXL_START_WORK_AMR		0x0000000000000001ULL
 #define CXL_START_WORK_NUM_IRQS		0x0000000000000002ULL
+#define CXL_START_WORK_ERR_FF		0x0000000000000004ULL
 #define CXL_START_WORK_ALL		(CXL_START_WORK_AMR |\
-					 CXL_START_WORK_NUM_IRQS)
+					 CXL_START_WORK_NUM_IRQS |\
+					 CXL_START_WORK_ERR_FF)
 
 
 /* Possible modes that an afu can be in */
-- 
cgit v1.2.3


From 81f03fedcce7ee7e83c37237ecaa2f68aad236fd Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Mon, 10 Aug 2015 15:20:41 -0600
Subject: NVMe: Add nvme subsystem reset IOCTL

Controllers can perform optional subsystem resets as introduced in NVMe
1.1. This patch adds an IOCTL to trigger the subsystem reset by writing
"NVMe" to the NSSR register.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Acked-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/uapi/linux/nvme.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index 732b32e92b02..8864194a4151 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -584,5 +584,6 @@ struct nvme_passthru_cmd {
 #define NVME_IOCTL_SUBMIT_IO	_IOW('N', 0x42, struct nvme_user_io)
 #define NVME_IOCTL_IO_CMD	_IOWR('N', 0x43, struct nvme_passthru_cmd)
 #define NVME_IOCTL_RESET	_IO('N', 0x44)
+#define NVME_IOCTL_SUBSYS_RESET	_IO('N', 0x45)
 
 #endif /* _UAPI_LINUX_NVME_H */
-- 
cgit v1.2.3


From bd49784fd1e8f42c7600fbfa206361324857f373 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 18 Aug 2015 16:26:16 -0400
Subject: dm stats: report precise_timestamps and histogram in @stats_list
 output

If the user selected the precise_timestamps or histogram options, report
it in the @stats_list message output.

If the user didn't select these options, no extra tokens are reported,
thus it is backward compatible with old software that doesn't know about
precise timestamps and histogram.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 4.2
---
 include/uapi/linux/dm-ioctl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 061aca3a962d..d34611e35a30 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	32
+#define DM_VERSION_MINOR	33
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2015-6-26)"
+#define DM_VERSION_EXTRA	"-ioctl (2015-8-18)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From 32a2b002ce615eadd3bfaddabde290f70a1dd17b Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:32 +0200
Subject: ipv6: route: per route IP tunnel metadata via lightweight tunnel

Allow specification of per route IP tunnel instructions also for IPv6.
This complements commit 3093fbe7ff4b ("route: Per route IP tunnel metadata
via lightweight tunnel").

Signed-off-by: Jiri Benc <jbenc@redhat.com>
CC: YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index aa84ca396bcb..34141a5dfe74 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -8,6 +8,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
 	LWTUNNEL_ENCAP_ILA,
+	LWTUNNEL_ENCAP_IP6,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -28,4 +29,19 @@ enum lwtunnel_ip_t {
 
 #define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1)
 
+enum lwtunnel_ip6_t {
+	LWTUNNEL_IP6_UNSPEC,
+	LWTUNNEL_IP6_ID,
+	LWTUNNEL_IP6_DST,
+	LWTUNNEL_IP6_SRC,
+	LWTUNNEL_IP6_HOPLIMIT,
+	LWTUNNEL_IP6_TC,
+	LWTUNNEL_IP6_SPORT,
+	LWTUNNEL_IP6_DPORT,
+	LWTUNNEL_IP6_FLAGS,
+	__LWTUNNEL_IP6_MAX,
+};
+
+#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
+
 #endif /* _UAPI_LWTUNNEL_H_ */
-- 
cgit v1.2.3


From e4ff67513096e6e196ca58043fce04d0f87babbe Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 26 Jul 2015 15:03:27 +0300
Subject: ipvs: add sync_maxlen parameter for the sync daemon

Allow setups with large MTU to send large sync packets by
adding sync_maxlen parameter. The default value is now based
on MTU but no more than 1500 for compatibility reasons.

To avoid problems if MTU changes allow fragmentation by
sending packets with DF=0. Problem reported by Dan Carpenter.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/uapi/linux/ip_vs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 3199243f2028..68377d8c8870 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -406,6 +406,7 @@ enum {
 	IPVS_DAEMON_ATTR_STATE,		/* sync daemon state (master/backup) */
 	IPVS_DAEMON_ATTR_MCAST_IFN,	/* multicast interface name */
 	IPVS_DAEMON_ATTR_SYNC_ID,	/* SyncID we belong to */
+	IPVS_DAEMON_ATTR_SYNC_MAXLEN,	/* UDP Payload Size */
 	__IPVS_DAEMON_ATTR_MAX,
 };
 
-- 
cgit v1.2.3


From d33288172e72c4729e8b9f2243fb40601afabc8f Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 26 Jul 2015 15:03:28 +0300
Subject: ipvs: add more mcast parameters for the sync daemon

- mcast_group: configure the multicast address, now IPv6
is supported too

- mcast_port: configure the multicast port

- mcast_ttl: configure the multicast TTL/HOP_LIMIT

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/uapi/linux/ip_vs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 68377d8c8870..391395c06c7e 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -407,6 +407,10 @@ enum {
 	IPVS_DAEMON_ATTR_MCAST_IFN,	/* multicast interface name */
 	IPVS_DAEMON_ATTR_SYNC_ID,	/* SyncID we belong to */
 	IPVS_DAEMON_ATTR_SYNC_MAXLEN,	/* UDP Payload Size */
+	IPVS_DAEMON_ATTR_MCAST_GROUP,	/* IPv4 Multicast Address */
+	IPVS_DAEMON_ATTR_MCAST_GROUP6,	/* IPv6 Multicast Address */
+	IPVS_DAEMON_ATTR_MCAST_PORT,	/* Multicast Port (base) */
+	IPVS_DAEMON_ATTR_MCAST_TTL,	/* Multicast TTL */
 	__IPVS_DAEMON_ATTR_MAX,
 };
 
-- 
cgit v1.2.3


From b96f465035f9fae83c1d8de3e80eecfe6877608c Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 25 Aug 2015 12:51:44 -0500
Subject: dlm: fix lvb copy for user locks

For a userland lock request, the previous and current
lock modes are used to decide when the lvb should be
copied back to the user.  The wrong previous value was
used, so that it always matched the current value.
This caused the lvb to be copied back to the user in
the wrong cases.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 include/uapi/linux/dlm_device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/dlm_device.h b/include/uapi/linux/dlm_device.h
index 3060783c4191..df56c8ff0769 100644
--- a/include/uapi/linux/dlm_device.h
+++ b/include/uapi/linux/dlm_device.h
@@ -26,7 +26,7 @@
 /* Version of the device interface */
 #define DLM_DEVICE_VERSION_MAJOR 6
 #define DLM_DEVICE_VERSION_MINOR 0
-#define DLM_DEVICE_VERSION_PATCH 1
+#define DLM_DEVICE_VERSION_PATCH 2
 
 /* struct passed to the lock write */
 struct dlm_lock_params {
-- 
cgit v1.2.3


From 1afe839e6b31a85fc53adbf8757d6373908d414d Mon Sep 17 00:00:00 2001
From: Andreas Herz <andi@geekosphere.org>
Date: Fri, 21 Aug 2015 11:31:32 +0200
Subject: netfilter: ip6t_REJECT: added missing icmpv6 codes

RFC 4443 added two new codes values for ICMPv6 type 1:

 5 - Source address failed ingress/egress policy
 6 - Reject route to destination

And RFC 7084 states in L-14 that IPv6 Router MUST send ICMPv6 Destination
Unreachable with code 5 for packets forwarded to it that use an address
from a prefix that has been invalidated.

Codes 5 and 6 are more informative subsets of code 1.

Signed-off-by: Andreas Herz <andi@geekosphere.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h b/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h
index 205ed62e4605..cd2e940c8bf5 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h
@@ -10,7 +10,9 @@ enum ip6t_reject_with {
 	IP6T_ICMP6_ADDR_UNREACH,
 	IP6T_ICMP6_PORT_UNREACH,
 	IP6T_ICMP6_ECHOREPLY,
-	IP6T_TCP_RESET
+	IP6T_TCP_RESET,
+	IP6T_ICMP6_POLICY_FAIL,
+	IP6T_ICMP6_REJECT_ROUTE
 };
 
 struct ip6t_reject_info {
-- 
cgit v1.2.3


From 65be2c79acc3aa0f9c0e8d4871f5a451d854465a Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs" <mrochs@linux.vnet.ibm.com>
Date: Thu, 13 Aug 2015 21:47:43 -0500
Subject: cxlflash: Superpipe support

Add superpipe supporting infrastructure to device driver for the IBM CXL
Flash adapter. This patch allows userspace applications to take advantage
of the accelerated I/O features that this adapter provides and bypass the
traditional filesystem stack.

Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
Reviewed-by: Michael Neuling <mikey@neuling.org>
Reviewed-by: Wen Xiong <wenxiong@linux.vnet.ibm.com>
Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: James Bottomley <JBottomley@Odin.com>
---
 include/uapi/scsi/Kbuild           |   1 +
 include/uapi/scsi/cxlflash_ioctl.h | 140 +++++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 include/uapi/scsi/cxlflash_ioctl.h

(limited to 'include/uapi')

diff --git a/include/uapi/scsi/Kbuild b/include/uapi/scsi/Kbuild
index 75746d52f208..d791e0ad509d 100644
--- a/include/uapi/scsi/Kbuild
+++ b/include/uapi/scsi/Kbuild
@@ -3,3 +3,4 @@ header-y += fc/
 header-y += scsi_bsg_fc.h
 header-y += scsi_netlink.h
 header-y += scsi_netlink_fc.h
+header-y += cxlflash_ioctl.h
diff --git a/include/uapi/scsi/cxlflash_ioctl.h b/include/uapi/scsi/cxlflash_ioctl.h
new file mode 100644
index 000000000000..570773406531
--- /dev/null
+++ b/include/uapi/scsi/cxlflash_ioctl.h
@@ -0,0 +1,140 @@
+/*
+ * CXL Flash Device Driver
+ *
+ * Written by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>, IBM Corporation
+ *             Matthew R. Ochs <mrochs@linux.vnet.ibm.com>, IBM Corporation
+ *
+ * Copyright (C) 2015 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _CXLFLASH_IOCTL_H
+#define _CXLFLASH_IOCTL_H
+
+#include <linux/types.h>
+
+/*
+ * Structure and flag definitions CXL Flash superpipe ioctls
+ */
+
+#define DK_CXLFLASH_VERSION_0	0
+
+struct dk_cxlflash_hdr {
+	__u16 version;			/* Version data */
+	__u16 rsvd[3];			/* Reserved for future use */
+	__u64 flags;			/* Input flags */
+	__u64 return_flags;		/* Returned flags */
+};
+
+/*
+ * Notes:
+ * -----
+ * The 'context_id' field of all ioctl structures contains the context
+ * identifier for a context in the lower 32-bits (upper 32-bits are not
+ * to be used when identifying a context to the AFU). That said, the value
+ * in its entirety (all 64-bits) is to be treated as an opaque cookie and
+ * should be presented as such when issuing ioctls.
+ *
+ * For DK_CXLFLASH_ATTACH ioctl, user specifies read/write access
+ * permissions via the O_RDONLY, O_WRONLY, and O_RDWR flags defined in
+ * the fcntl.h header file.
+ */
+#define DK_CXLFLASH_ATTACH_REUSE_CONTEXT	0x8000000000000000ULL
+
+struct dk_cxlflash_attach {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 num_interrupts;		/* Requested number of interrupts */
+	__u64 context_id;		/* Returned context */
+	__u64 mmio_size;		/* Returned size of MMIO area */
+	__u64 block_size;		/* Returned block size, in bytes */
+	__u64 adap_fd;			/* Returned adapter file descriptor */
+	__u64 last_lba;			/* Returned last LBA on the device */
+	__u64 max_xfer;			/* Returned max transfer size, blocks */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+struct dk_cxlflash_detach {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id;		/* Context to detach */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+struct dk_cxlflash_udirect {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id;		/* Context to own physical resources */
+	__u64 rsrc_handle;		/* Returned resource handle */
+	__u64 last_lba;			/* Returned last LBA on the device */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+struct dk_cxlflash_release {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id;		/* Context owning resources */
+	__u64 rsrc_handle;		/* Resource handle to release */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+#define DK_CXLFLASH_VERIFY_SENSE_LEN	18
+#define DK_CXLFLASH_VERIFY_HINT_SENSE	0x8000000000000000ULL
+
+struct dk_cxlflash_verify {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id;		/* Context owning resources to verify */
+	__u64 rsrc_handle;		/* Resource handle of LUN */
+	__u64 hint;			/* Reasons for verify */
+	__u64 last_lba;			/* Returned last LBA of device */
+	__u8 sense_data[DK_CXLFLASH_VERIFY_SENSE_LEN]; /* SCSI sense data */
+	__u8 pad[6];			/* Pad to next 8-byte boundary */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+#define DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET	0x8000000000000000ULL
+
+struct dk_cxlflash_recover_afu {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 reason;			/* Reason for recovery request */
+	__u64 context_id;		/* Context to recover / updated ID */
+	__u64 mmio_size;		/* Returned size of MMIO area */
+	__u64 adap_fd;			/* Returned adapter file descriptor */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+#define DK_CXLFLASH_MANAGE_LUN_WWID_LEN			16
+#define DK_CXLFLASH_MANAGE_LUN_ENABLE_SUPERPIPE		0x8000000000000000ULL
+#define DK_CXLFLASH_MANAGE_LUN_DISABLE_SUPERPIPE	0x4000000000000000ULL
+#define DK_CXLFLASH_MANAGE_LUN_ALL_PORTS_ACCESSIBLE	0x2000000000000000ULL
+
+struct dk_cxlflash_manage_lun {
+	struct dk_cxlflash_hdr hdr;			/* Common fields */
+	__u8 wwid[DK_CXLFLASH_MANAGE_LUN_WWID_LEN];	/* Page83 WWID, NAA-6 */
+	__u64 reserved[8];				/* Rsvd, future use */
+};
+
+union cxlflash_ioctls {
+	struct dk_cxlflash_attach attach;
+	struct dk_cxlflash_detach detach;
+	struct dk_cxlflash_udirect udirect;
+	struct dk_cxlflash_release release;
+	struct dk_cxlflash_verify verify;
+	struct dk_cxlflash_recover_afu recover_afu;
+	struct dk_cxlflash_manage_lun manage_lun;
+};
+
+#define MAX_CXLFLASH_IOCTL_SZ	(sizeof(union cxlflash_ioctls))
+
+#define CXL_MAGIC 0xCA
+#define CXL_IOWR(_n, _s)	_IOWR(CXL_MAGIC, _n, struct _s)
+
+#define DK_CXLFLASH_ATTACH		CXL_IOWR(0x80, dk_cxlflash_attach)
+#define DK_CXLFLASH_USER_DIRECT		CXL_IOWR(0x81, dk_cxlflash_udirect)
+#define DK_CXLFLASH_RELEASE		CXL_IOWR(0x82, dk_cxlflash_release)
+#define DK_CXLFLASH_DETACH		CXL_IOWR(0x83, dk_cxlflash_detach)
+#define DK_CXLFLASH_VERIFY		CXL_IOWR(0x84, dk_cxlflash_verify)
+#define DK_CXLFLASH_RECOVER_AFU		CXL_IOWR(0x85, dk_cxlflash_recover_afu)
+#define DK_CXLFLASH_MANAGE_LUN		CXL_IOWR(0x86, dk_cxlflash_manage_lun)
+
+#endif /* ifndef _CXLFLASH_IOCTL_H */
-- 
cgit v1.2.3


From 2cb79266d6b229dbebd31fe114af1bdab25c8076 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs" <mrochs@linux.vnet.ibm.com>
Date: Thu, 13 Aug 2015 21:47:53 -0500
Subject: cxlflash: Virtual LUN support

Add support for physical LUN segmentation (virtual LUNs) to device
driver supporting the IBM CXL Flash adapter. This patch allows user
space applications to virtually segment a physical LUN into N virtual
LUNs, taking advantage of the translation features provided by this
adapter.

Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
Reviewed-by: Michael Neuling <mikey@neuling.org>
Reviewed-by: Wen Xiong <wenxiong@linux.vnet.ibm.com>
Signed-off-by: James Bottomley <JBottomley@Odin.com>
---
 include/uapi/scsi/cxlflash_ioctl.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/scsi/cxlflash_ioctl.h b/include/uapi/scsi/cxlflash_ioctl.h
index 570773406531..831351b2e660 100644
--- a/include/uapi/scsi/cxlflash_ioctl.h
+++ b/include/uapi/scsi/cxlflash_ioctl.h
@@ -71,6 +71,17 @@ struct dk_cxlflash_udirect {
 	__u64 reserved[8];		/* Reserved for future use */
 };
 
+#define DK_CXLFLASH_UVIRTUAL_NEED_WRITE_SAME	0x8000000000000000ULL
+
+struct dk_cxlflash_uvirtual {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id;		/* Context to own virtual resources */
+	__u64 lun_size;			/* Requested size, in 4K blocks */
+	__u64 rsrc_handle;		/* Returned resource handle */
+	__u64 last_lba;			/* Returned last LBA of LUN */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
 struct dk_cxlflash_release {
 	struct dk_cxlflash_hdr hdr;	/* Common fields */
 	__u64 context_id;		/* Context owning resources */
@@ -78,6 +89,23 @@ struct dk_cxlflash_release {
 	__u64 reserved[8];		/* Reserved for future use */
 };
 
+struct dk_cxlflash_resize {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id;		/* Context owning resources */
+	__u64 rsrc_handle;		/* Resource handle of LUN to resize */
+	__u64 req_size;			/* New requested size, in 4K blocks */
+	__u64 last_lba;			/* Returned last LBA of LUN */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
+struct dk_cxlflash_clone {
+	struct dk_cxlflash_hdr hdr;	/* Common fields */
+	__u64 context_id_src;		/* Context to clone from */
+	__u64 context_id_dst;		/* Context to clone to */
+	__u64 adap_fd_src;		/* Source context adapter fd */
+	__u64 reserved[8];		/* Reserved for future use */
+};
+
 #define DK_CXLFLASH_VERIFY_SENSE_LEN	18
 #define DK_CXLFLASH_VERIFY_HINT_SENSE	0x8000000000000000ULL
 
@@ -118,7 +146,10 @@ union cxlflash_ioctls {
 	struct dk_cxlflash_attach attach;
 	struct dk_cxlflash_detach detach;
 	struct dk_cxlflash_udirect udirect;
+	struct dk_cxlflash_uvirtual uvirtual;
 	struct dk_cxlflash_release release;
+	struct dk_cxlflash_resize resize;
+	struct dk_cxlflash_clone clone;
 	struct dk_cxlflash_verify verify;
 	struct dk_cxlflash_recover_afu recover_afu;
 	struct dk_cxlflash_manage_lun manage_lun;
@@ -136,5 +167,8 @@ union cxlflash_ioctls {
 #define DK_CXLFLASH_VERIFY		CXL_IOWR(0x84, dk_cxlflash_verify)
 #define DK_CXLFLASH_RECOVER_AFU		CXL_IOWR(0x85, dk_cxlflash_recover_afu)
 #define DK_CXLFLASH_MANAGE_LUN		CXL_IOWR(0x86, dk_cxlflash_manage_lun)
+#define DK_CXLFLASH_USER_VIRTUAL	CXL_IOWR(0x87, dk_cxlflash_uvirtual)
+#define DK_CXLFLASH_VLUN_RESIZE		CXL_IOWR(0x88, dk_cxlflash_resize)
+#define DK_CXLFLASH_VLUN_CLONE		CXL_IOWR(0x89, dk_cxlflash_clone)
 
 #endif /* ifndef _CXLFLASH_IOCTL_H */
-- 
cgit v1.2.3


From 7f8a436eaa2c3ddd8e1ff2fbca267e6275085536 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 26 Aug 2015 11:31:48 -0700
Subject: openvswitch: Add conntrack action

Expose the kernel connection tracker via OVS. Userspace components can
make use of the CT action to populate the connection state (ct_state)
field for a flow. This state can be subsequently matched.

Exposed connection states are OVS_CS_F_*:
- NEW (0x01) - Beginning of a new connection.
- ESTABLISHED (0x02) - Part of an existing connection.
- RELATED (0x04) - Related to an established connection.
- INVALID (0x20) - Could not track the connection for this packet.
- REPLY_DIR (0x40) - This packet is in the reply direction for the flow.
- TRACKED (0x80) - This packet has been sent through conntrack.

When the CT action is executed by itself, it will send the packet
through the connection tracker and populate the ct_state field with one
or more of the connection state flags above. The CT action will always
set the TRACKED bit.

When the COMMIT flag is passed to the conntrack action, this specifies
that information about the connection should be stored. This allows
subsequent packets for the same (or related) connections to be
correlated with this connection. Sending subsequent packets for the
connection through conntrack allows the connection tracker to consider
the packets as ESTABLISHED, RELATED, and/or REPLY_DIR.

The CT action may optionally take a zone to track the flow within. This
allows connections with the same 5-tuple to be kept logically separate
from connections in other zones. If the zone is specified, then the
"ct_zone" match field will be subsequently populated with the zone id.

IP fragments are handled by transparently assembling them as part of the
CT action. The maximum received unit (MRU) size is tracked so that
refragmentation can occur during output.

IP frag handling contributed by Andy Zhou.

Based on original design by Justin Pettit.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Justin Pettit <jpettit@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d6b885460187..55f599792673 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -164,6 +164,9 @@ enum ovs_packet_cmd {
  * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
  * output port is actually a tunnel port. Contains the output tunnel key
  * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
+ * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
+ * size.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -180,6 +183,7 @@ enum ovs_packet_attr {
 	OVS_PACKET_ATTR_UNUSED2,
 	OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
 				       error logging should be suppressed. */
+	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
 	__OVS_PACKET_ATTR_MAX
 };
 
@@ -319,6 +323,8 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_MPLS,      /* array of struct ovs_key_mpls.
 				 * The implementation may restrict
 				 * the accepted length of the array. */
+	OVS_KEY_ATTR_CT_STATE,	/* u8 bitmask of OVS_CS_F_* */
+	OVS_KEY_ATTR_CT_ZONE,	/* u16 connection tracking zone. */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -431,6 +437,15 @@ struct ovs_key_nd {
 	__u8	nd_tll[ETH_ALEN];
 };
 
+/* OVS_KEY_ATTR_CT_STATE flags */
+#define OVS_CS_F_NEW               0x01 /* Beginning of a new connection. */
+#define OVS_CS_F_ESTABLISHED       0x02 /* Part of an existing connection. */
+#define OVS_CS_F_RELATED           0x04 /* Related to an established
+					 * connection. */
+#define OVS_CS_F_INVALID           0x20 /* Could not track connection. */
+#define OVS_CS_F_REPLY_DIR         0x40 /* Flow is in the reply direction. */
+#define OVS_CS_F_TRACKED           0x80 /* Conntrack has occurred. */
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -594,6 +609,28 @@ struct ovs_action_hash {
 	uint32_t  hash_basis;
 };
 
+/**
+ * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action.
+ * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags.
+ * @OVS_CT_ATTR_ZONE: u16 connection tracking zone.
+ */
+enum ovs_ct_attr {
+	OVS_CT_ATTR_UNSPEC,
+	OVS_CT_ATTR_FLAGS,      /* u8 bitmask of OVS_CT_F_*. */
+	OVS_CT_ATTR_ZONE,       /* u16 zone id. */
+	__OVS_CT_ATTR_MAX
+};
+
+#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
+
+/*
+ * OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_*
+ * @OVS_CT_F_COMMIT: Commits the flow to the conntrack table. This allows
+ * future packets for the same connection to be identified as 'established'
+ * or 'related'.
+ */
+#define OVS_CT_F_COMMIT		0x01
+
 /**
  * enum ovs_action_attr - Action types.
  *
@@ -623,6 +660,8 @@ struct ovs_action_hash {
  * indicate the new packet contents. This could potentially still be
  * %ETH_P_MPLS if the resulting MPLS label stack is not empty.  If there
  * is no MPLS label stack, as determined by ethertype, no action is taken.
+ * @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related
+ * entries in the flow key.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -648,6 +687,7 @@ enum ovs_action_attr {
 				       * data immediately followed by a mask.
 				       * The data must be zero for the unmasked
 				       * bits. */
+	OVS_ACTION_ATTR_CT,           /* One nested OVS_CT_ATTR_* . */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
-- 
cgit v1.2.3


From 182e3042e15de759e81618d11fe4f62f5259d982 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 26 Aug 2015 11:31:49 -0700
Subject: openvswitch: Allow matching on conntrack mark

Allow matching and setting the ct_mark field. As with ct_state and
ct_zone, these fields are populated when the CT action is executed. To
write to this field, a value and mask can be specified as a nested
attribute under the CT action. This data is stored with the conntrack
entry, and is executed after the lookup occurs for the CT action. The
conntrack entry itself must be committed using the COMMIT flag in the CT
action flags for this change to persist.

Signed-off-by: Justin Pettit <jpettit@nicira.com>
Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 55f599792673..7a185b554343 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -325,6 +325,7 @@ enum ovs_key_attr {
 				 * the accepted length of the array. */
 	OVS_KEY_ATTR_CT_STATE,	/* u8 bitmask of OVS_CS_F_* */
 	OVS_KEY_ATTR_CT_ZONE,	/* u16 connection tracking zone. */
+	OVS_KEY_ATTR_CT_MARK,	/* u32 connection tracking mark */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -613,11 +614,15 @@ struct ovs_action_hash {
  * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action.
  * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags.
  * @OVS_CT_ATTR_ZONE: u16 connection tracking zone.
+ * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the
+ * mask, the corresponding bit in the value is copied to the connection
+ * tracking mark field in the connection.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
 	OVS_CT_ATTR_FLAGS,      /* u8 bitmask of OVS_CT_F_*. */
 	OVS_CT_ATTR_ZONE,       /* u16 zone id. */
+	OVS_CT_ATTR_MARK,       /* mark to associate with this connection. */
 	__OVS_CT_ATTR_MAX
 };
 
-- 
cgit v1.2.3


From c2ac667358708d7cce64c78f58af6adf4c1e848b Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 26 Aug 2015 11:31:52 -0700
Subject: openvswitch: Allow matching on conntrack label

Allow matching and setting the ct_label field. As with ct_mark, this is
populated by executing the CT action. The label field may be modified by
specifying a label and mask nested under the CT action. It is stored as
metadata attached to the connection. Label modification occurs after
lookup, and will only persist when the conntrack entry is committed by
providing the COMMIT flag to the CT action. Labels are currently fixed
to 128 bits in size.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7a185b554343..9d52058a9330 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -326,6 +326,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_STATE,	/* u8 bitmask of OVS_CS_F_* */
 	OVS_KEY_ATTR_CT_ZONE,	/* u16 connection tracking zone. */
 	OVS_KEY_ATTR_CT_MARK,	/* u32 connection tracking mark */
+	OVS_KEY_ATTR_CT_LABEL,	/* 16-octet connection tracking label */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -438,6 +439,11 @@ struct ovs_key_nd {
 	__u8	nd_tll[ETH_ALEN];
 };
 
+#define OVS_CT_LABEL_LEN	16
+struct ovs_key_ct_label {
+	__u8	ct_label[OVS_CT_LABEL_LEN];
+};
+
 /* OVS_KEY_ATTR_CT_STATE flags */
 #define OVS_CS_F_NEW               0x01 /* Beginning of a new connection. */
 #define OVS_CS_F_ESTABLISHED       0x02 /* Part of an existing connection. */
@@ -617,12 +623,16 @@ struct ovs_action_hash {
  * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the
  * mask, the corresponding bit in the value is copied to the connection
  * tracking mark field in the connection.
+ * @OVS_CT_ATTR_LABEL: %OVS_CT_LABEL_LEN value followed by %OVS_CT_LABEL_LEN
+ * mask. For each bit set in the mask, the corresponding bit in the value is
+ * copied to the connection tracking label field in the connection.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
 	OVS_CT_ATTR_FLAGS,      /* u8 bitmask of OVS_CT_F_*. */
 	OVS_CT_ATTR_ZONE,       /* u16 zone id. */
 	OVS_CT_ATTR_MARK,       /* mark to associate with this connection. */
+	OVS_CT_ATTR_LABEL,      /* label to associate with this connection. */
 	__OVS_CT_ATTR_MAX
 };
 
-- 
cgit v1.2.3


From cae3a2627520c3795b54533c5328b77af3405dbe Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 26 Aug 2015 11:31:53 -0700
Subject: openvswitch: Allow attaching helpers to ct action

Add support for using conntrack helpers to assist protocol detection.
The new OVS_CT_ATTR_HELPER attribute of the CT action specifies a helper
to be used for this connection. If no helper is specified, then helpers
will be automatically applied as per the sysctl configuration of
net.netfilter.nf_conntrack_helper.

The helper may be specified as part of the conntrack action, eg:
ct(helper=ftp). Initial packets for related connections should be
committed to allow later packets for the flow to be considered
established.

Example ovs-ofctl flows allowing FTP connections from ports 1->2:
in_port=1,tcp,action=ct(helper=ftp,commit),2
in_port=2,tcp,ct_state=-trk,action=ct(recirc)
in_port=2,tcp,ct_state=+trk-new+est,action=1
in_port=2,tcp,ct_state=+trk+rel,action=1

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 9d52058a9330..32e07d8cbaf4 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -626,6 +626,7 @@ struct ovs_action_hash {
  * @OVS_CT_ATTR_LABEL: %OVS_CT_LABEL_LEN value followed by %OVS_CT_LABEL_LEN
  * mask. For each bit set in the mask, the corresponding bit in the value is
  * copied to the connection tracking label field in the connection.
+ * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -633,6 +634,8 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_ZONE,       /* u16 zone id. */
 	OVS_CT_ATTR_MARK,       /* mark to associate with this connection. */
 	OVS_CT_ATTR_LABEL,      /* label to associate with this connection. */
+	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
+				   related connections. */
 	__OVS_CT_ATTR_MAX
 };
 
-- 
cgit v1.2.3


From d2d427b3927bd7a0348fc7f323d0e291f79a2779 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Thu, 27 Aug 2015 15:32:26 +0900
Subject: bridge: Add netlink support for vlan_protocol attribute

This enables bridge vlan_protocol to be configured through netlink.

When CONFIG_BRIDGE_VLAN_FILTERING is disabled, kernel behaves the
same way as this feature is not implemented.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 313c305fd1ad..2d13dd44ecaa 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -231,6 +231,7 @@ enum {
 	IFLA_BR_STP_STATE,
 	IFLA_BR_PRIORITY,
 	IFLA_BR_VLAN_FILTERING,
+	IFLA_BR_VLAN_PROTOCOL,
 	__IFLA_BR_MAX,
 };
 
-- 
cgit v1.2.3


From cd7918b35f0ee0106bbe2ce4a14b5a8c9763deb8 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 26 Aug 2015 23:46:51 -0700
Subject: geneve: Make dst-port configurable.

Add netlink interface to configure Geneve UDP port number.
So that user can configure it for a Gevene device.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 2d13dd44ecaa..9d73c31896d0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -410,6 +410,7 @@ enum {
 	IFLA_GENEVE_REMOTE,
 	IFLA_GENEVE_TTL,
 	IFLA_GENEVE_TOS,
+	IFLA_GENEVE_PORT,	/* destination port */
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From e305ac6cf5a1e1386aedce7ef9cb773635d5845c Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 26 Aug 2015 23:46:52 -0700
Subject: geneve: Add support to collect tunnel metadata.

Following patch create new tunnel flag which enable
tunnel metadata collection on given device. These devices
can be used by tunnel metadata based routing or by OVS.
Geneve Consolidation patch get rid of collect_md_tun to
simplify tunnel lookup further.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9d73c31896d0..3a5f263cfc2f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -411,6 +411,7 @@ enum {
 	IFLA_GENEVE_TTL,
 	IFLA_GENEVE_TOS,
 	IFLA_GENEVE_PORT,	/* destination port */
+	IFLA_GENEVE_COLLECT_METADATA,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 5153aacfb8e2744af68e7b84ccd3f02aeefe4f48 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:12:15 +0800
Subject: NFS: Update NFS4_BITMAP_SIZE

v4.1/v4.2 have define attributes at word2, nfs client also support
security label now.

v3, same as v2.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/uapi/linux/nfs4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h
index 2119c7c274d7..2b871e0858d9 100644
--- a/include/uapi/linux/nfs4.h
+++ b/include/uapi/linux/nfs4.h
@@ -15,7 +15,7 @@
 
 #include <linux/types.h>
 
-#define NFS4_BITMAP_SIZE	2
+#define NFS4_BITMAP_SIZE	3
 #define NFS4_VERIFIER_SIZE	8
 #define NFS4_STATEID_SEQID_SIZE 4
 #define NFS4_STATEID_OTHER_SIZE 12
-- 
cgit v1.2.3


From 0a6a3a23ea6efde079a5b77688541a98bf202721 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Fri, 28 Aug 2015 07:07:48 +0200
Subject: netlink: add NETLINK_CAP_ACK socket option

Since commit c05cdb1b864f ("netlink: allow large data transfers from
user-space"), the kernel may fail to allocate the necessary room for the
acknowledgment message back to userspace. This patch introduces a new
socket option that trims off the payload of the original netlink message.

The netlink message header is still included, so the user can guess from
the sequence number what is the message that has triggered the
acknowledgment.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index cf6a65cccbdf..6f3fe16cd22a 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -110,6 +110,7 @@ struct nlmsgerr {
 #define NETLINK_TX_RING			7
 #define NETLINK_LISTEN_ALL_NSID		8
 #define NETLINK_LIST_MEMBERSHIPS	9
+#define NETLINK_CAP_ACK			10
 
 struct nl_pktinfo {
 	__u32	group;
-- 
cgit v1.2.3


From b8d3e4163a3562d7cba486687904383e78e7dd6a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 31 Aug 2015 15:58:46 +0200
Subject: fib, fib6: reject invalid feature bits

Feature bits that are invalid should not be accepted by the kernel,
only the lower 4 bits may be configured, but not the remaining ones.
Even from these 4, 2 of them are unused.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc43356..702024769c74 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -418,10 +418,13 @@ enum {
 
 #define RTAX_MAX (__RTAX_MAX - 1)
 
-#define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
-#define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_ECN	(1 << 0)
+#define RTAX_FEATURE_SACK	(1 << 1)
+#define RTAX_FEATURE_TIMESTAMP	(1 << 2)
+#define RTAX_FEATURE_ALLFRAG	(1 << 3)
+
+#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
+				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
 
 struct rta_session {
 	__u8	proto;
-- 
cgit v1.2.3


From 16f7249ddf831f5ec0e1358222ce5db300446b84 Mon Sep 17 00:00:00 2001
From: Artem Savkov <asavkov@redhat.com>
Date: Wed, 2 Sep 2015 13:41:18 +0200
Subject: uapi/drm/i915_drm.h: fix userspace compilation.

commit 346add7834557b5b9628b9bf2387106d42e631d4
Author: Daniel Vetter <daniel.vetter@ffwll.ch>
Date:   Tue Jul 14 18:07:30 2015 +0200

    drm/i915: Use expcitly fixed type in compat32 structs

changed the type of param field in drm_i915_getparam from int to
s32. This header is exported to userspace and needs to use userspace
type __s32 instead.

This fixes userspace compilation errors like the following:
include/drm/i915_drm.h:361:2: error: unknown type name 's32'
  s32 param;

Signed-off-by: Artem Savkov <asavkov@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 include/uapi/drm/i915_drm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index dbd16a2d37db..fd5aa47bd689 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -358,7 +358,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
-	s32 param;
+	__s32 param;
 	/*
 	 * WARNING: Using pointers instead of fixed-size u64 means we need to write
 	 * compat32 code. Don't repeat this mistake.
-- 
cgit v1.2.3


From 58319057b7847667f0c9585b9de0e8932b0fdb08 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 4 Sep 2015 15:42:45 -0700
Subject: capabilities: ambient capabilities

Credit where credit is due: this idea comes from Christoph Lameter with
a lot of valuable input from Serge Hallyn.  This patch is heavily based
on Christoph's patch.

===== The status quo =====

On Linux, there are a number of capabilities defined by the kernel.  To
perform various privileged tasks, processes can wield capabilities that
they hold.

Each task has four capability masks: effective (pE), permitted (pP),
inheritable (pI), and a bounding set (X).  When the kernel checks for a
capability, it checks pE.  The other capability masks serve to modify
what capabilities can be in pE.

Any task can remove capabilities from pE, pP, or pI at any time.  If a
task has a capability in pP, it can add that capability to pE and/or pI.
If a task has CAP_SETPCAP, then it can add any capability to pI, and it
can remove capabilities from X.

Tasks are not the only things that can have capabilities; files can also
have capabilities.  A file can have no capabilty information at all [1].
If a file has capability information, then it has a permitted mask (fP)
and an inheritable mask (fI) as well as a single effective bit (fE) [2].
File capabilities modify the capabilities of tasks that execve(2) them.

A task that successfully calls execve has its capabilities modified for
the file ultimately being excecuted (i.e.  the binary itself if that
binary is ELF or for the interpreter if the binary is a script.) [3] In
the capability evolution rules, for each mask Z, pZ represents the old
value and pZ' represents the new value.  The rules are:

  pP' = (X & fP) | (pI & fI)
  pI' = pI
  pE' = (fE ? pP' : 0)
  X is unchanged

For setuid binaries, fP, fI, and fE are modified by a moderately
complicated set of rules that emulate POSIX behavior.  Similarly, if
euid == 0 or ruid == 0, then fP, fI, and fE are modified differently
(primary, fP and fI usually end up being the full set).  For nonroot
users executing binaries with neither setuid nor file caps, fI and fP
are empty and fE is false.

As an extra complication, if you execute a process as nonroot and fE is
set, then the "secure exec" rules are in effect: AT_SECURE gets set,
LD_PRELOAD doesn't work, etc.

This is rather messy.  We've learned that making any changes is
dangerous, though: if a new kernel version allows an unprivileged
program to change its security state in a way that persists cross
execution of a setuid program or a program with file caps, this
persistent state is surprisingly likely to allow setuid or file-capped
programs to be exploited for privilege escalation.

===== The problem =====

Capability inheritance is basically useless.

If you aren't root and you execute an ordinary binary, fI is zero, so
your capabilities have no effect whatsoever on pP'.  This means that you
can't usefully execute a helper process or a shell command with elevated
capabilities if you aren't root.

On current kernels, you can sort of work around this by setting fI to
the full set for most or all non-setuid executable files.  This causes
pP' = pI for nonroot, and inheritance works.  No one does this because
it's a PITA and it isn't even supported on most filesystems.

If you try this, you'll discover that every nonroot program ends up with
secure exec rules, breaking many things.

This is a problem that has bitten many people who have tried to use
capabilities for anything useful.

===== The proposed change =====

This patch adds a fifth capability mask called the ambient mask (pA).
pA does what most people expect pI to do.

pA obeys the invariant that no bit can ever be set in pA if it is not
set in both pP and pI.  Dropping a bit from pP or pI drops that bit from
pA.  This ensures that existing programs that try to drop capabilities
still do so, with a complication.  Because capability inheritance is so
broken, setting KEEPCAPS, using setresuid to switch to nonroot uids, and
then calling execve effectively drops capabilities.  Therefore,
setresuid from root to nonroot conditionally clears pA unless
SECBIT_NO_SETUID_FIXUP is set.  Processes that don't like this can
re-add bits to pA afterwards.

The capability evolution rules are changed:

  pA' = (file caps or setuid or setgid ? 0 : pA)
  pP' = (X & fP) | (pI & fI) | pA'
  pI' = pI
  pE' = (fE ? pP' : pA')
  X is unchanged

If you are nonroot but you have a capability, you can add it to pA.  If
you do so, your children get that capability in pA, pP, and pE.  For
example, you can set pA = CAP_NET_BIND_SERVICE, and your children can
automatically bind low-numbered ports.  Hallelujah!

Unprivileged users can create user namespaces, map themselves to a
nonzero uid, and create both privileged (relative to their namespace)
and unprivileged process trees.  This is currently more or less
impossible.  Hallelujah!

You cannot use pA to try to subvert a setuid, setgid, or file-capped
program: if you execute any such program, pA gets cleared and the
resulting evolution rules are unchanged by this patch.

Users with nonzero pA are unlikely to unintentionally leak that
capability.  If they run programs that try to drop privileges, dropping
privileges will still work.

It's worth noting that the degree of paranoia in this patch could
possibly be reduced without causing serious problems.  Specifically, if
we allowed pA to persist across executing non-pA-aware setuid binaries
and across setresuid, then, naively, the only capabilities that could
leak as a result would be the capabilities in pA, and any attacker
*already* has those capabilities.  This would make me nervous, though --
setuid binaries that tried to privilege-separate might fail to do so,
and putting CAP_DAC_READ_SEARCH or CAP_DAC_OVERRIDE into pA could have
unexpected side effects.  (Whether these unexpected side effects would
be exploitable is an open question.) I've therefore taken the more
paranoid route.  We can revisit this later.

An alternative would be to require PR_SET_NO_NEW_PRIVS before setting
ambient capabilities.  I think that this would be annoying and would
make granting otherwise unprivileged users minor ambient capabilities
(CAP_NET_BIND_SERVICE or CAP_NET_RAW for example) much less useful than
it is with this patch.

===== Footnotes =====

[1] Files that are missing the "security.capability" xattr or that have
unrecognized values for that xattr end up with has_cap set to false.
The code that does that appears to be complicated for no good reason.

[2] The libcap capability mask parsers and formatters are dangerously
misleading and the documentation is flat-out wrong.  fE is *not* a mask;
it's a single bit.  This has probably confused every single person who
has tried to use file capabilities.

[3] Linux very confusingly processes both the script and the interpreter
if applicable, for reasons that elude me.  The results from thinking
about a script's file capabilities and/or setuid bits are mostly
discarded.

Preliminary userspace code is here, but it needs updating:
https://git.kernel.org/cgit/linux/kernel/git/luto/util-linux-playground.git/commit/?h=cap_ambient&id=7f5afbd175d2

Here is a test program that can be used to verify the functionality
(from Christoph):

/*
 * Test program for the ambient capabilities. This program spawns a shell
 * that allows running processes with a defined set of capabilities.
 *
 * (C) 2015 Christoph Lameter <cl@linux.com>
 * Released under: GPL v3 or later.
 *
 *
 * Compile using:
 *
 *	gcc -o ambient_test ambient_test.o -lcap-ng
 *
 * This program must have the following capabilities to run properly:
 * Permissions for CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE
 *
 * A command to equip the binary with the right caps is:
 *
 *	setcap cap_net_raw,cap_net_admin,cap_sys_nice+p ambient_test
 *
 *
 * To get a shell with additional caps that can be inherited by other processes:
 *
 *	./ambient_test /bin/bash
 *
 *
 * Verifying that it works:
 *
 * From the bash spawed by ambient_test run
 *
 *	cat /proc/$$/status
 *
 * and have a look at the capabilities.
 */

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <cap-ng.h>
#include <sys/prctl.h>
#include <linux/capability.h>

/*
 * Definitions from the kernel header files. These are going to be removed
 * when the /usr/include files have these defined.
 */
#define PR_CAP_AMBIENT 47
#define PR_CAP_AMBIENT_IS_SET 1
#define PR_CAP_AMBIENT_RAISE 2
#define PR_CAP_AMBIENT_LOWER 3
#define PR_CAP_AMBIENT_CLEAR_ALL 4

static void set_ambient_cap(int cap)
{
	int rc;

	capng_get_caps_process();
	rc = capng_update(CAPNG_ADD, CAPNG_INHERITABLE, cap);
	if (rc) {
		printf("Cannot add inheritable cap\n");
		exit(2);
	}
	capng_apply(CAPNG_SELECT_CAPS);

	/* Note the two 0s at the end. Kernel checks for these */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) {
		perror("Cannot set cap");
		exit(1);
	}
}

int main(int argc, char **argv)
{
	int rc;

	set_ambient_cap(CAP_NET_RAW);
	set_ambient_cap(CAP_NET_ADMIN);
	set_ambient_cap(CAP_SYS_NICE);

	printf("Ambient_test forking shell\n");
	if (execv(argv[1], argv + 1))
		perror("Cannot exec");

	return 0;
}

Signed-off-by: Christoph Lameter <cl@linux.com> # Original author
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Aaron Jones <aaronmdjones@gmail.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Andrew G. Morgan <morgan@kernel.org>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Austin S Hemmelgarn <ahferroin7@gmail.com>
Cc: Markku Savela <msa@moth.iki.fi>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: James Morris <james.l.morris@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/prctl.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
 # define PR_FP_MODE_FR		(1 << 0)	/* 64b FP registers */
 # define PR_FP_MODE_FRE		(1 << 1)	/* 32b compatibility */
 
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT			47
+# define PR_CAP_AMBIENT_IS_SET		1
+# define PR_CAP_AMBIENT_RAISE		2
+# define PR_CAP_AMBIENT_LOWER		3
+# define PR_CAP_AMBIENT_CLEAR_ALL	4
+
 #endif /* _LINUX_PRCTL_H */
-- 
cgit v1.2.3


From 746bf6d64275be0c65b0631d8a72b16f1454cfa1 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 4 Sep 2015 15:42:51 -0700
Subject: capabilities: add a securebit to disable PR_CAP_AMBIENT_RAISE

Per Andrew Morgan's request, add a securebit to allow admins to disable
PR_CAP_AMBIENT_RAISE.  This securebit will prevent processes from adding
capabilities to their ambient set.

For simplicity, this disables PR_CAP_AMBIENT_RAISE entirely rather than
just disabling setting previously cleared bits.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Aaron Jones <aaronmdjones@gmail.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Andrew G. Morgan <morgan@kernel.org>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Austin S Hemmelgarn <ahferroin7@gmail.com>
Cc: Markku Savela <msa@moth.iki.fi>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: James Morris <james.l.morris@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/securebits.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index 985aac9e6bf8..35ac35cef217 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -43,9 +43,18 @@
 #define SECBIT_KEEP_CAPS	(issecure_mask(SECURE_KEEP_CAPS))
 #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
 
+/* When set, a process cannot add new capabilities to its ambient set. */
+#define SECURE_NO_CAP_AMBIENT_RAISE		6
+#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED	7  /* make bit-6 immutable */
+
+#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
+#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
+			(issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
+
 #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
 				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
-				 issecure_mask(SECURE_KEEP_CAPS))
+				 issecure_mask(SECURE_KEEP_CAPS) | \
+				 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
 #define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
 
 #endif /* _UAPI_LINUX_SECUREBITS_H */
-- 
cgit v1.2.3


From 1038628d80e96e3a086189172d9be8eb85ecfabf Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:04 -0700
Subject: userfaultfd: uAPI

Defines the uAPI of the userfaultfd, notably the ioctl numbers and protocol.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/Kbuild        |  1 +
 include/uapi/linux/userfaultfd.h | 83 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 include/uapi/linux/userfaultfd.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index aafb9937b162..70ff1d9abf0d 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -456,3 +456,4 @@ header-y += xfrm.h
 header-y += xilinx-v4l2-controls.h
 header-y += zorro.h
 header-y += zorro_ids.h
+header-y += userfaultfd.h
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 000000000000..09c2e2a8c9d6
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,83 @@
+/*
+ *  include/linux/userfaultfd.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#define UFFD_API ((__u64)0xAA)
+/* FIXME: add "|UFFD_BIT_WP" to UFFD_API_BITS after implementing it */
+#define UFFD_API_BITS (UFFD_BIT_WRITE)
+#define UFFD_API_IOCTLS				\
+	((__u64)1 << _UFFDIO_REGISTER |		\
+	 (__u64)1 << _UFFDIO_UNREGISTER |	\
+	 (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS			\
+	((__u64)1 << _UFFDIO_WAKE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F.  UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER		(0x00)
+#define _UFFDIO_UNREGISTER		(0x01)
+#define _UFFDIO_WAKE			(0x02)
+#define _UFFDIO_API			(0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API		_IOWR(UFFDIO, _UFFDIO_API,	\
+				      struct uffdio_api)
+#define UFFDIO_REGISTER		_IOWR(UFFDIO, _UFFDIO_REGISTER, \
+				      struct uffdio_register)
+#define UFFDIO_UNREGISTER	_IOR(UFFDIO, _UFFDIO_UNREGISTER,	\
+				     struct uffdio_range)
+#define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
+				     struct uffdio_range)
+
+/*
+ * Valid bits below PAGE_SHIFT in the userfault address read through
+ * the read() syscall.
+ */
+#define UFFD_BIT_WRITE	(1<<0)	/* this was a write fault, MISSING or WP */
+#define UFFD_BIT_WP	(1<<1)	/* handle_userfault() reason VM_UFFD_WP */
+#define UFFD_BITS	2	/* two above bits used for UFFD_BIT_* mask */
+
+struct uffdio_api {
+	/* userland asks for an API number */
+	__u64 api;
+
+	/* kernel answers below with the available features for the API */
+	__u64 bits;
+	__u64 ioctls;
+};
+
+struct uffdio_range {
+	__u64 start;
+	__u64 len;
+};
+
+struct uffdio_register {
+	struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
+	__u64 mode;
+
+	/*
+	 * kernel answers which ioctl commands are available for the
+	 * range, keep at the end as the last 8 bytes aren't read.
+	 */
+	__u64 ioctls;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
-- 
cgit v1.2.3


From 3f602d2724b1f7d2d27ddcd7963a040a5890fd16 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Fri, 4 Sep 2015 15:46:34 -0700
Subject: userfaultfd: Rename uffd_api.bits into .features

This is (seems to be) the minimal thing that is required to unblock
standard uffd usage from the non-cooperative one.  Now more bits can be
added to the features field indicating e.g.  UFFD_FEATURE_FORK and others
needed for the latter use-case.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/userfaultfd.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 09c2e2a8c9d6..330206016249 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -12,8 +12,8 @@
 #include <linux/types.h>
 
 #define UFFD_API ((__u64)0xAA)
-/* FIXME: add "|UFFD_BIT_WP" to UFFD_API_BITS after implementing it */
-#define UFFD_API_BITS (UFFD_BIT_WRITE)
+/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */
+#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -53,12 +53,18 @@
 #define UFFD_BIT_WP	(1<<1)	/* handle_userfault() reason VM_UFFD_WP */
 #define UFFD_BITS	2	/* two above bits used for UFFD_BIT_* mask */
 
+/*
+ * Features reported in uffdio_api.features field
+ */
+#define UFFD_FEATURE_WRITE_BIT	(1<<0) /* Corresponds to UFFD_BIT_WRITE */
+#define UFFD_FEATURE_WP_BIT	(1<<1) /* Corresponds to UFFD_BIT_WP */
+
 struct uffdio_api {
 	/* userland asks for an API number */
 	__u64 api;
 
 	/* kernel answers below with the available features for the API */
-	__u64 bits;
+	__u64 features;
 	__u64 ioctls;
 };
 
-- 
cgit v1.2.3


From a9b85f9415fd9e529d03299e5335433f614ec1fb Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:46:37 -0700
Subject: userfaultfd: change the read API to return a uffd_msg

I had requests to return the full address (not the page aligned one) to
userland.

It's not entirely clear how the page offset could be relevant because
userfaults aren't like SIGBUS that can sigjump to a different place and it
actually skip resolving the fault depending on a page offset.  There's
currently no real way to skip the fault especially because after a
UFFDIO_COPY|ZEROPAGE, the fault is optimized to be retried within the
kernel without having to return to userland first (not even self modifying
code replacing the .text that touched the faulting address would prevent
the fault to be repeated).  Userland cannot skip repeating the fault even
more so if the fault was triggered by a KVM secondary page fault or any
get_user_pages or any copy-user inside some syscall which will return to
kernel code.  The second time FAULT_FLAG_RETRY_NOWAIT won't be set leading
to a SIGBUS being raised because the userfault can't wait if it cannot
release the mmap_map first (and FAULT_FLAG_RETRY_NOWAIT is required for
that).

Still returning userland a proper structure during the read() on the uffd,
can allow to use the current UFFD_API for the future non-cooperative
extensions too and it looks cleaner as well.  Once we get additional
fields there's no point to return the fault address page aligned anymore
to reuse the bits below PAGE_SHIFT.

The only downside is that the read() syscall will read 32bytes instead of
8bytes but that's not going to be measurable overhead.

The total number of new events that can be extended or of new future bits
for already shipped events, is limited to 64 by the features field of the
uffdio_api structure.  If more will be needed a bump of UFFD_API will be
required.

[akpm@linux-foundation.org: use __packed]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/userfaultfd.h | 70 +++++++++++++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 15 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 330206016249..a5f8825381ef 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -11,9 +11,15 @@
 
 #include <linux/types.h>
 
+#include <linux/compiler.h>
+
 #define UFFD_API ((__u64)0xAA)
-/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */
-#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ *			      UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -45,26 +51,60 @@
 #define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
 				     struct uffdio_range)
 
-/*
- * Valid bits below PAGE_SHIFT in the userfault address read through
- * the read() syscall.
- */
-#define UFFD_BIT_WRITE	(1<<0)	/* this was a write fault, MISSING or WP */
-#define UFFD_BIT_WP	(1<<1)	/* handle_userfault() reason VM_UFFD_WP */
-#define UFFD_BITS	2	/* two above bits used for UFFD_BIT_* mask */
+/* read() structure */
+struct uffd_msg {
+	__u8	event;
+
+	__u8	reserved1;
+	__u16	reserved2;
+	__u32	reserved3;
+
+	union {
+		struct {
+			__u64	flags;
+			__u64	address;
+		} pagefault;
+
+		struct {
+			/* unused reserved fields */
+			__u64	reserved1;
+			__u64	reserved2;
+			__u64	reserved3;
+		} reserved;
+	} arg;
+} __packed;
 
 /*
- * Features reported in uffdio_api.features field
+ * Start at 0x12 and not at 0 to be more strict against bugs.
  */
-#define UFFD_FEATURE_WRITE_BIT	(1<<0) /* Corresponds to UFFD_BIT_WRITE */
-#define UFFD_FEATURE_WP_BIT	(1<<1) /* Corresponds to UFFD_BIT_WP */
+#define UFFD_EVENT_PAGEFAULT	0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK		0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
 
 struct uffdio_api {
-	/* userland asks for an API number */
+	/* userland asks for an API number and the features to enable */
 	__u64 api;
-
-	/* kernel answers below with the available features for the API */
+	/*
+	 * Kernel answers below with the all available features for
+	 * the API, this notifies userland of which events and/or
+	 * which flags for each event are enabled in the current
+	 * kernel.
+	 *
+	 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+	 * are to be considered implicitly always enabled in all kernels as
+	 * long as the uffdio_api.api requested matches UFFD_API.
+	 */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
+#define UFFD_FEATURE_EVENT_FORK			(1<<1)
+#endif
 	__u64 features;
+
 	__u64 ioctls;
 };
 
-- 
cgit v1.2.3


From 1f1c6f075904c241f9e44eb37efa8777141fc938 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 4 Sep 2015 15:47:01 -0700
Subject: userfaultfd: UFFDIO_COPY|UFFDIO_ZEROPAGE uAPI

This implements the uABI of UFFDIO_COPY and UFFDIO_ZEROPAGE.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/userfaultfd.h | 42 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index a5f8825381ef..df0e09bb7dd5 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -25,7 +25,9 @@
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
 	 (__u64)1 << _UFFDIO_API)
 #define UFFD_API_RANGE_IOCTLS			\
-	((__u64)1 << _UFFDIO_WAKE)
+	((__u64)1 << _UFFDIO_WAKE |		\
+	 (__u64)1 << _UFFDIO_COPY |		\
+	 (__u64)1 << _UFFDIO_ZEROPAGE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -38,6 +40,8 @@
 #define _UFFDIO_REGISTER		(0x00)
 #define _UFFDIO_UNREGISTER		(0x01)
 #define _UFFDIO_WAKE			(0x02)
+#define _UFFDIO_COPY			(0x03)
+#define _UFFDIO_ZEROPAGE		(0x04)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -50,6 +54,10 @@
 				     struct uffdio_range)
 #define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
 				     struct uffdio_range)
+#define UFFDIO_COPY		_IOWR(UFFDIO, _UFFDIO_COPY,	\
+				      struct uffdio_copy)
+#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
+				      struct uffdio_zeropage)
 
 /* read() structure */
 struct uffd_msg {
@@ -126,4 +134,36 @@ struct uffdio_register {
 	__u64 ioctls;
 };
 
+struct uffdio_copy {
+	__u64 dst;
+	__u64 src;
+	__u64 len;
+	/*
+	 * There will be a wrprotection flag later that allows to map
+	 * pages wrprotected on the fly. And such a flag will be
+	 * available if the wrprotection ioctl are implemented for the
+	 * range according to the uffdio_register.ioctls.
+	 */
+#define UFFDIO_COPY_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * "copy" is written by the ioctl and must be at the end: the
+	 * copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 copy;
+};
+
+struct uffdio_zeropage {
+	struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * "zeropage" is written by the ioctl and must be at the end:
+	 * the copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 zeropage;
+};
+
 #endif /* _LINUX_USERFAULTFD_H */
-- 
cgit v1.2.3


From a13d7201d7deedcbb6ac6efa94a1a7d34d3d79ec Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@citrix.com>
Date: Fri, 7 Aug 2015 17:34:41 +0100
Subject: xen/privcmd: Further s/MFN/GFN/ clean-up

The privcmd code is mixing the usage of GFN and MFN within the same
functions which make the code difficult to understand when you only work
with auto-translated guests.

The privcmd driver is only dealing with GFN so replace all the mention
of MFN into GFN.

The ioctl structure used to map foreign change has been left unchanged
given that the userspace is using it. Nonetheless, add a comment to
explain the expected value within the "mfn" field.

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 include/uapi/xen/privcmd.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index a85316811d79..7ddeeda93809 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -44,6 +44,10 @@ struct privcmd_hypercall {
 
 struct privcmd_mmap_entry {
 	__u64 va;
+	/*
+	 * This should be a GFN. It's not possible to change the name because
+	 * it's exposed to the user-space.
+	 */
 	__u64 mfn;
 	__u64 npages;
 };
-- 
cgit v1.2.3


From b14132797d8041a42e03f4ffa1e722da1425adfb Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 18 Aug 2015 03:28:01 -0400
Subject: elf-em.h: move EM_MICROBLAZE to the common header

The linux/audit.h header uses EM_MICROBLAZE in order to define
AUDIT_ARCH_MICROBLAZE, but it's only available in the microblaze
asm headers.  Move it to the common elf-em.h header so that the
define can be used on non-microblaze systems.  Otherwise we get
build errors that EM_MICROBLAZE isn't defined when we try to use
the AUDIT_ARCH_MICROBLAZE symbol.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
---
 include/uapi/linux/elf-em.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index 3429a3ba382b..b56dfcfe922a 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -39,6 +39,7 @@
 #define EM_TI_C6000	140	/* TI C6X DSPs */
 #define EM_AARCH64	183	/* ARM 64 bit */
 #define EM_TILEPRO	188	/* Tilera TILEPro */
+#define EM_MICROBLAZE	189	/* Xilinx MicroBlaze */
 #define EM_TILEGX	191	/* Tilera TILE-Gx */
 #define EM_FRV		0x5441	/* Fujitsu FR-V */
 #define EM_AVR32	0x18ad	/* Atmel AVR32 */
-- 
cgit v1.2.3


From 1ab1e895492d8084dfc1c854efacde219e56b8c1 Mon Sep 17 00:00:00 2001
From: Henrik Austad <henrik@austad.us>
Date: Wed, 9 Sep 2015 12:25:17 +0200
Subject: ether: add IEEE 1722 ethertype - TSN

IEEE 1722 describes AVB (later renamed to TSN - Time Sensitive
Networking), a protocol, encapsualtion and synchronization to utilize
standard networks for audio/video (and later other time-sensitive)
streams.

This standard uses ethertype 0x22F0.

http://standards.ieee.org/develop/regauth/ethertype/eth.txt

This is a respin of a previous patch ("ether: add AVB frame type
ETH_P_AVB")

CC: "David S. Miller" <davem@davemloft.net>
CC: netdev@vger.kernel.org
CC: linux-api@vger.kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Henrik Austad <henrik@austad.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index aa63ed023c2b..ea9221b0331a 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -42,6 +42,7 @@
 #define ETH_P_LOOP	0x0060		/* Ethernet Loopback packet	*/
 #define ETH_P_PUP	0x0200		/* Xerox PUP packet		*/
 #define ETH_P_PUPAT	0x0201		/* Xerox PUP Addr Trans packet	*/
+#define ETH_P_TSN	0x22F0		/* TSN (IEEE 1722) packet	*/
 #define ETH_P_IP	0x0800		/* Internet Protocol packet	*/
 #define ETH_P_X25	0x0805		/* CCITT X.25			*/
 #define ETH_P_ARP	0x0806		/* Address Resolution packet	*/
-- 
cgit v1.2.3


From f074a8f49eb87cde95ac9d040ad5e7ea4f029738 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Wed, 9 Sep 2015 15:35:48 -0700
Subject: proc: export idle flag via kpageflags

As noted by Minchan, a benefit of reading idle flag from /proc/kpageflags
is that one can easily filter dirty and/or unevictable pages while
estimating the size of unused memory.

Note that idle flag read from /proc/kpageflags may be stale in case the
page was accessed via a PTE, because it would be too costly to iterate
over all page mappings on each /proc/kpageflags read to provide an
up-to-date value.  To make sure the flag is up-to-date one has to read
/sys/kernel/mm/page_idle/bitmap first.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/kernel-page-flags.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index a6c4962e5d46..5da5f8751ce7 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -33,6 +33,7 @@
 #define KPF_THP			22
 #define KPF_BALLOON		23
 #define KPF_ZERO_PAGE		24
+#define KPF_IDLE		25
 
 
 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
-- 
cgit v1.2.3


From ac64a2ce509104a746321a4f9646b6750cf281eb Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Fri, 4 Sep 2015 01:39:56 +0200
Subject: target: use stringify.h instead of own definition

Signed-off-by: David Disseldorp <ddiss@suse.de>
Acked-by: Andy Grover <agrover@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 include/uapi/linux/target_core_user.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h
index b67f99d3c520..95c6521d8a95 100644
--- a/include/uapi/linux/target_core_user.h
+++ b/include/uapi/linux/target_core_user.h
@@ -42,10 +42,6 @@
 #define TCMU_MAILBOX_VERSION 2
 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */
 
-/* See https://gcc.gnu.org/onlinedocs/cpp/Stringification.html */
-#define xstr(s) str(s)
-#define str(s) #s
-
 struct tcmu_mailbox {
 	__u16 version;
 	__u16 flags;
-- 
cgit v1.2.3


From 5b25b13ab08f616efd566347d809b4ece54570d1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 11 Sep 2015 13:07:39 -0700
Subject: sys_membarrier(): system-wide memory barrier (generic, x86)

Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system.  It is
implemented by calling synchronize_sched().  It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier.  For synchronization primitives
that distinguish between read-side and write-side (e.g.  userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.

The existing applications of which I am aware that would be improved by
this system call are as follows:

* Through Userspace RCU library (http://urcu.so)
  - DNS server (Knot DNS) https://www.knot-dns.cz/
  - Network sniffer (http://netsniff-ng.org/)
  - Distributed object storage (https://sheepdog.github.io/sheepdog/)
  - User-space tracing (http://lttng.org)
  - Network storage system (https://www.gluster.org/)
  - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
  - Financial software (https://lkml.org/lkml/2015/3/23/189)

Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking.  Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().

* Direct users of sys_membarrier
  - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)

Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux.  They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.

To explain the benefit of this scheme, let's introduce two example threads:

Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())

In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".

Before the change, we had, for each smp_mb() pairs:

Thread A                    Thread B
previous mem accesses       previous mem accesses
smp_mb()                    smp_mb()
following mem accesses      following mem accesses

After the change, these pairs become:

Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses

As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).

1) Non-concurrent Thread A vs Thread B accesses:

Thread A                    Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
                            prev mem accesses
                            barrier()
                            follow mem accesses

In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.

2) Concurrent Thread A vs Thread B accesses

Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses

In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().

* Benchmarks

On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)

1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call.

* User-space user of this system call: Userspace RCU library

Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every process
threads (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.

Results in liburcu:

Operations in 10s, 6 readers, 2 writers:

memory barriers in reader:    1701557485 reads, 2202847 writes
signal-based scheme:          9830061167 reads,    6700 writes
sys_membarrier:               9952759104 reads,     425 writes
sys_membarrier (dyn. check):  7970328887 reads,     425 writes

The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.

Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.

An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.

This patch adds the system call to x86 and to asm-generic.

[1] http://urcu.so

membarrier(2) man page:

MEMBARRIER(2)              Linux Programmer's Manual             MEMBARRIER(2)

NAME
       membarrier - issue memory barriers on a set of threads

SYNOPSIS
       #include <linux/membarrier.h>

       int membarrier(int cmd, int flags);

DESCRIPTION
       The cmd argument is one of the following:

       MEMBARRIER_CMD_QUERY
              Query  the  set  of  supported commands. It returns a bitmask of
              supported commands.

       MEMBARRIER_CMD_SHARED
              Execute a memory barrier on all threads running on  the  system.
              Upon  return from system call, the caller thread is ensured that
              all running threads have passed through a state where all memory
              accesses  to  user-space  addresses  match program order between
              entry to and return from the system  call  (non-running  threads
              are de facto in such a state). This covers threads from all pro=E2=80=90
              cesses running on the system.  This command returns 0.

       The flags argument needs to be 0. For future extensions.

       All memory accesses performed  in  program  order  from  each  targeted
       thread is guaranteed to be ordered with respect to sys_membarrier(). If
       we use the semantic "barrier()" to represent a compiler barrier forcing
       memory  accesses  to  be performed in program order across the barrier,
       and smp_mb() to represent explicit memory barriers forcing full  memory
       ordering  across  the barrier, we have the following ordering table for
       each pair of barrier(), sys_membarrier() and smp_mb():

       The pair ordering is detailed as (O: ordered, X: not ordered):

                              barrier()   smp_mb() sys_membarrier()
              barrier()          X           X            O
              smp_mb()           X           O            O
              sys_membarrier()   O           O            O

RETURN VALUE
       On success, these system calls return zero.  On error, -1 is  returned,
       and errno is set appropriately. For a given command, with flags
       argument set to 0, this system call is guaranteed to always return the
       same value until reboot.

ERRORS
       ENOSYS System call is not implemented.

       EINVAL Invalid arguments.

Linux                             2015-04-15                     MEMBARRIER(2)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/asm-generic/unistd.h |  4 ++-
 include/uapi/linux/Kbuild         |  1 +
 include/uapi/linux/membarrier.h   | 53 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/membarrier.h

(limited to 'include/uapi')

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9b1a04..8da542a2874d 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
 __SYSCALL(__NR_bpf, sys_bpf)
 #define __NR_execveat 281
 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_membarrier 282
+__SYSCALL(__NR_membarrier, sys_membarrier)
 
 #undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 70ff1d9abf0d..f7b2db44eb4b 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -252,6 +252,7 @@ header-y += mdio.h
 header-y += media.h
 header-y += media-bus-format.h
 header-y += mei.h
+header-y += membarrier.h
 header-y += memfd.h
 header-y += mempolicy.h
 header-y += meye.h
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
new file mode 100644
index 000000000000..e0b108bd2624
--- /dev/null
+++ b/include/uapi/linux/membarrier.h
@@ -0,0 +1,53 @@
+#ifndef _UAPI_LINUX_MEMBARRIER_H
+#define _UAPI_LINUX_MEMBARRIER_H
+
+/*
+ * linux/membarrier.h
+ *
+ * membarrier system call API
+ *
+ * Copyright (c) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * enum membarrier_cmd - membarrier system call command
+ * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
+ *                          a bitmask of valid commands.
+ * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This covers threads from all processes
+ *                          running on the system. This command returns 0.
+ *
+ * Command to be passed to the membarrier system call. The commands need to
+ * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
+ * the value 0.
+ */
+enum membarrier_cmd {
+	MEMBARRIER_CMD_QUERY = 0,
+	MEMBARRIER_CMD_SHARED = (1 << 0),
+};
+
+#endif /* _UAPI_LINUX_MEMBARRIER_H */
-- 
cgit v1.2.3