From 8cf4d6a224a0226987d9cba69cb46d93814fe449 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 15 Nov 2013 15:57:53 +0100
Subject: net: reorder struct netns_ct for better cache-line usage

Reorder struct netns_ct so that atomic_t "count" changes don't
slowdown users of read mostly fields.

This is based on Eric Dumazet's proposed patch:
 "netfilter: conntrack: remove the central spinlock"
 http://thread.gmane.org/gmane.linux.network/268758/focus=47306

The tricky part of cache-aligning this structure, that it is getting
inlined in struct net (include/net/net_namespace.h), thus changes to
other netns_xxx structures affects our alignment.

Eric's original patch contained an ambiguity on 32-bit regarding
alignment in struct net.  This patch also takes 32-bit into account,
and in case of changed (struct net) alignment sysctl_xxx entries have
been ordered according to how often they are accessed.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index c9c0c538b68b..fbcc7fa536dc 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -65,6 +65,23 @@ struct nf_ip_net {
 struct netns_ct {
 	atomic_t		count;
 	unsigned int		expect_count;
+#ifdef CONFIG_SYSCTL
+	struct ctl_table_header	*sysctl_header;
+	struct ctl_table_header	*acct_sysctl_header;
+	struct ctl_table_header	*tstamp_sysctl_header;
+	struct ctl_table_header	*event_sysctl_header;
+	struct ctl_table_header	*helper_sysctl_header;
+#endif
+	char			*slabname;
+	unsigned int		sysctl_log_invalid; /* Log invalid packets */
+	unsigned int		sysctl_events_retry_timeout;
+	int			sysctl_events;
+	int			sysctl_acct;
+	int			sysctl_auto_assign_helper;
+	bool			auto_assign_helper_warned;
+	int			sysctl_tstamp;
+	int			sysctl_checksum;
+
 	unsigned int		htable_size;
 	struct kmem_cache	*nf_conntrack_cachep;
 	struct hlist_nulls_head	*hash;
@@ -75,14 +92,6 @@ struct netns_ct {
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
 	struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
-	int			sysctl_events;
-	unsigned int		sysctl_events_retry_timeout;
-	int			sysctl_acct;
-	int			sysctl_tstamp;
-	int			sysctl_checksum;
-	unsigned int		sysctl_log_invalid; /* Log invalid packets */
-	int			sysctl_auto_assign_helper;
-	bool			auto_assign_helper_warned;
 	struct nf_ip_net	nf_ct_proto;
 #if defined(CONFIG_NF_CONNTRACK_LABELS)
 	unsigned int		labels_used;
@@ -92,13 +101,5 @@ struct netns_ct {
 	struct hlist_head	*nat_bysource;
 	unsigned int		nat_htable_size;
 #endif
-#ifdef CONFIG_SYSCTL
-	struct ctl_table_header	*sysctl_header;
-	struct ctl_table_header	*acct_sysctl_header;
-	struct ctl_table_header	*tstamp_sysctl_header;
-	struct ctl_table_header	*event_sysctl_header;
-	struct ctl_table_header	*helper_sysctl_header;
-#endif
-	char			*slabname;
 };
 #endif
-- 
cgit v1.2.3


From 08c0cad69f32ad1e881fa3fb7f5e0a25db5b07ce Mon Sep 17 00:00:00 2001
From: Valentina Giusti <valentina.giusti@bmw-carit.de>
Date: Fri, 20 Dec 2013 17:28:53 +0100
Subject: netfilter: nfnetlink_queue: enable UID/GID socket info retrieval

Thanks to commits 41063e9 (ipv4: Early TCP socket demux) and 421b388
(udp: ipv4: Add udp early demux) it is now possible to parse UID and
GID socket info also for incoming TCP and UDP connections. Having
this info available, it is convenient to let NFQUEUE parse it in
order to improve and refine the traffic analysis in userspace.

Signed-off-by: Valentina Giusti <valentina.giusti@bmw-carit.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_queue.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index 0132bad79de7..8dd819e2b5fe 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -47,6 +47,8 @@ enum nfqnl_attr_type {
 	NFQA_CAP_LEN,			/* __u32 length of captured packet */
 	NFQA_SKB_INFO,			/* __u32 skb meta information */
 	NFQA_EXP,			/* nf_conntrack_netlink.h */
+	NFQA_UID,			/* __u32 sk uid */
+	NFQA_GID,			/* __u32 sk gid */
 
 	__NFQA_MAX
 };
@@ -99,7 +101,8 @@ enum nfqnl_attr_config {
 #define NFQA_CFG_F_FAIL_OPEN			(1 << 0)
 #define NFQA_CFG_F_CONNTRACK			(1 << 1)
 #define NFQA_CFG_F_GSO				(1 << 2)
-#define NFQA_CFG_F_MAX				(1 << 3)
+#define NFQA_CFG_F_UID_GID			(1 << 3)
+#define NFQA_CFG_F_MAX				(1 << 4)
 
 /* flags for NFQA_SKB_INFO */
 /* packet appears to have wrong checksums, but they are ok */
-- 
cgit v1.2.3


From 6a649f339802f104549e1fb211e381036661e244 Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Wed, 18 Dec 2013 11:27:02 +0800
Subject: netfilter: add IPv4/6 IPComp extension match support

With this plugin, user could specify IPComp tagged with certain
CPI that host not interested will be DROPped or any other action.

For example:
iptables  -A INPUT -p 108 -m ipcomp --ipcompspi 0x87 -j DROP
ip6tables -A INPUT -p 108 -m ipcomp --ipcompspi 0x87 -j DROP

Then input IPComp packet with CPI equates 0x87 will not reach
upper layer anymore.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/Kbuild      |  1 +
 include/uapi/linux/netfilter/xt_ipcomp.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 include/uapi/linux/netfilter/xt_ipcomp.h

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 17c3af2c4bb9..91be8ce623f0 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -54,6 +54,7 @@ header-y += xt_ecn.h
 header-y += xt_esp.h
 header-y += xt_hashlimit.h
 header-y += xt_helper.h
+header-y += xt_ipcomp.h
 header-y += xt_iprange.h
 header-y += xt_ipvs.h
 header-y += xt_length.h
diff --git a/include/uapi/linux/netfilter/xt_ipcomp.h b/include/uapi/linux/netfilter/xt_ipcomp.h
new file mode 100644
index 000000000000..45c7e40eb8e1
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_ipcomp.h
@@ -0,0 +1,16 @@
+#ifndef _XT_IPCOMP_H
+#define _XT_IPCOMP_H
+
+#include <linux/types.h>
+
+struct xt_ipcomp {
+	__u32 spis[2];	/* Security Parameter Index */
+	__u8 invflags;	/* Inverse flags */
+	__u8 hdrres;	/* Test of the Reserved Filed */
+};
+
+/* Values for "invflags" field in struct xt_ipcomp. */
+#define XT_IPCOMP_INV_SPI	0x01	/* Invert the sense of spi. */
+#define XT_IPCOMP_INV_MASK	0x01	/* All possible flags. */
+
+#endif /*_XT_IPCOMP_H*/
-- 
cgit v1.2.3


From 34ce324019e76f6d93768d68343a0e78f464d754 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 20 Dec 2013 22:40:29 +0100
Subject: netfilter: nf_nat: add full port randomization support

We currently use prandom_u32() for allocation of ports in tcp bind(0)
and udp code. In case of plain SNAT we try to keep the ports as is
or increment on collision.

SNAT --random mode does use per-destination incrementing port
allocation. As a recent paper pointed out in [1] that this mode of
port allocation makes it possible to an attacker to find the randomly
allocated ports through a timing side-channel in a socket overloading
attack conducted through an off-path attacker.

So, NF_NAT_RANGE_PROTO_RANDOM actually weakens the port randomization
in regard to the attack described in this paper. As we need to keep
compatibility, add another flag called NF_NAT_RANGE_PROTO_RANDOM_FULLY
that would replace the NF_NAT_RANGE_PROTO_RANDOM hash-based port
selection algorithm with a simple prandom_u32() in order to mitigate
this attack vector. Note that the lfsr113's internal state is
periodically reseeded by the kernel through a local secure entropy
source.

More details can be found in [1], the basic idea is to send bursts
of packets to a socket to overflow its receive queue and measure
the latency to detect a possible retransmit when the port is found.
Because of increasing ports to given destination and port, further
allocations can be predicted. This information could then be used by
an attacker for e.g. for cache-poisoning, NS pinning, and degradation
of service attacks against DNS servers [1]:

  The best defense against the poisoning attacks is to properly
  deploy and validate DNSSEC; DNSSEC provides security not only
  against off-path attacker but even against MitM attacker. We hope
  that our results will help motivate administrators to adopt DNSSEC.
  However, full DNSSEC deployment make take significant time, and
  until that happens, we recommend short-term, non-cryptographic
  defenses. We recommend to support full port randomisation,
  according to practices recommended in [2], and to avoid
  per-destination sequential port allocation, which we show may be
  vulnerable to derandomisation attacks.

Joint work between Hannes Frederic Sowa and Daniel Borkmann.

 [1] https://sites.google.com/site/hayashulman/files/NIC-derandomisation.pdf
 [2] http://arxiv.org/pdf/1205.5190v1.pdf

Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_nat.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h
index bf0cc373ffb6..1ad3659102b6 100644
--- a/include/uapi/linux/netfilter/nf_nat.h
+++ b/include/uapi/linux/netfilter/nf_nat.h
@@ -4,10 +4,14 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_conntrack_tuple_common.h>
 
-#define NF_NAT_RANGE_MAP_IPS		1
-#define NF_NAT_RANGE_PROTO_SPECIFIED	2
-#define NF_NAT_RANGE_PROTO_RANDOM	4
-#define NF_NAT_RANGE_PERSISTENT		8
+#define NF_NAT_RANGE_MAP_IPS			(1 << 0)
+#define NF_NAT_RANGE_PROTO_SPECIFIED		(1 << 1)
+#define NF_NAT_RANGE_PROTO_RANDOM		(1 << 2)
+#define NF_NAT_RANGE_PERSISTENT			(1 << 3)
+#define NF_NAT_RANGE_PROTO_RANDOM_FULLY		(1 << 4)
+
+#define NF_NAT_RANGE_PROTO_RANDOM_ALL		\
+	(NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
 
 struct nf_nat_ipv4_range {
 	unsigned int			flags;
-- 
cgit v1.2.3


From 02eca9d2cc541806e8f03b4131c7ee9120246df7 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Mon, 30 Dec 2013 17:13:10 -0800
Subject: netfilter: ipset: remove unused code

Function never used in current upstream code.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index c7174b816674..0c7d01eae56c 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -331,7 +331,6 @@ extern ip_set_id_t ip_set_get_byname(struct net *net,
 				     const char *name, struct ip_set **set);
 extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
 extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
-extern ip_set_id_t ip_set_nfnl_get(struct net *net, const char *name);
 extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
 extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
 
-- 
cgit v1.2.3


From dcd93ed4cd1669b2c1510e801fe5f1132390761c Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Mon, 30 Dec 2013 17:16:08 -0800
Subject: netfilter: nf_conntrack: remove dead code

The following code is not used in current upstream code.
Some of this seems to be old hooks, other might be used by some
out of tree module (which I don't care about breaking), and
the need_ipv4_conntrack was used by old NAT code but no longer
called.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 2 --
 include/net/netfilter/nf_conntrack_l3proto.h   | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 6c3d12e2949f..981c327374da 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -19,6 +19,4 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
 
-void need_ipv4_conntrack(void);
-
 #endif /*_NF_CONNTRACK_IPV4_H*/
diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index 3efab704b7eb..adc1fa3dd7ab 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -87,7 +87,6 @@ int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto);
 void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto);
 
 struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto);
-void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p);
 
 /* Existing built-in protocols */
 extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic;
-- 
cgit v1.2.3


From fe1217c4f3f7d7cbf8efdd8dd5fdc7204a1d65a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 29 Dec 2013 18:27:10 +0100
Subject: net: net_cls: move cgroupfs classid handling into core

Zefan Li requested [1] to perform the following cleanup/refactoring:

- Split cgroupfs classid handling into net core to better express a
  possible more generic use.

- Disable module support for cgroupfs bits as the majority of other
  cgroupfs subsystems do not have that, and seems to be not wished
  from cgroup side. Zefan probably might want to follow-up for netprio
  later on.

- By this, code can be further reduced which previously took care of
  functionality built when compiled as module.

cgroupfs bits are being placed under net/core/netclassid_cgroup.c, so
that we are consistent with {netclassid,netprio}_cgroup naming that is
under net/core/ as suggested by Zefan.

No change in functionality, but only code refactoring that is being
done here.

 [1] http://patchwork.ozlabs.org/patch/304825/

Suggested-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: cgroups@vger.kernel.org
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/cgroup_subsys.h |  2 +-
 include/net/cls_cgroup.h      | 40 ++++++++++++----------------------------
 2 files changed, 13 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index b613ffd402d1..58bf94de4b8e 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -31,7 +31,7 @@ SUBSYS(devices)
 SUBSYS(freezer)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_NET_CLS_CGROUP)
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
 SUBSYS(net_cls)
 #endif
 
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 33d03b648646..9cf2d5ef38d9 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -16,17 +16,16 @@
 #include <linux/cgroup.h>
 #include <linux/hardirq.h>
 #include <linux/rcupdate.h>
+#include <net/sock.h>
 
-#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
-struct cgroup_cls_state
-{
+#ifdef CONFIG_CGROUP_NET_CLASSID
+struct cgroup_cls_state {
 	struct cgroup_subsys_state css;
 	u32 classid;
 };
 
-void sock_update_classid(struct sock *sk);
+struct cgroup_cls_state *task_cls_state(struct task_struct *p);
 
-#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP)
 static inline u32 task_cls_classid(struct task_struct *p)
 {
 	u32 classid;
@@ -41,33 +40,18 @@ static inline u32 task_cls_classid(struct task_struct *p)
 
 	return classid;
 }
-#elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
-static inline u32 task_cls_classid(struct task_struct *p)
-{
-	struct cgroup_subsys_state *css;
-	u32 classid = 0;
-
-	if (in_interrupt())
-		return 0;
-
-	rcu_read_lock();
-	css = task_css(p, net_cls_subsys_id);
-	if (css)
-		classid = container_of(css,
-				       struct cgroup_cls_state, css)->classid;
-	rcu_read_unlock();
 
-	return classid;
-}
-#endif
-#else /* !CGROUP_NET_CLS_CGROUP */
 static inline void sock_update_classid(struct sock *sk)
 {
-}
+	u32 classid;
 
-static inline u32 task_cls_classid(struct task_struct *p)
+	classid = task_cls_classid(current);
+	if (classid != sk->sk_classid)
+		sk->sk_classid = classid;
+}
+#else /* !CONFIG_CGROUP_NET_CLASSID */
+static inline void sock_update_classid(struct sock *sk)
 {
-	return 0;
 }
-#endif /* CGROUP_NET_CLS_CGROUP */
+#endif /* CONFIG_CGROUP_NET_CLASSID */
 #endif  /* _NET_CLS_CGROUP_H */
-- 
cgit v1.2.3


From 86f8515f9721fa171483f0fe0391968fbb949cc9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 29 Dec 2013 17:27:11 +0100
Subject: net: netprio: rename config to be more consistent with cgroup configs

While we're at it and introduced CGROUP_NET_CLASSID, lets also make
NETPRIO_CGROUP more consistent with the rest of cgroups and rename it
into CONFIG_CGROUP_NET_PRIO so that for networking, we now have
CONFIG_CGROUP_NET_{PRIO,CLASSID}. This not only makes the CONFIG
option consistent among networking cgroups, but also among cgroups
CONFIG conventions in general as the vast majority has a prefix of
CONFIG_CGROUP_<SUBSYS>.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Zefan Li <lizefan@huawei.com>
Cc: cgroups@vger.kernel.org
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/cgroup_subsys.h |  2 +-
 include/linux/netdevice.h     |  2 +-
 include/net/netprio_cgroup.h  | 18 ++++++------------
 include/net/sock.h            |  2 +-
 4 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 58bf94de4b8e..7b99d717411d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -43,7 +43,7 @@ SUBSYS(blkio)
 SUBSYS(perf)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 SUBSYS(net_prio)
 #endif
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5260d2eae2e6..45cf68194aa8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1444,7 +1444,7 @@ struct net_device {
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
 #endif
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
 	/* phy device may attach itself for hardware timestamping */
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 099d02782e22..dafc09f0fdbc 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -13,12 +13,12 @@
 
 #ifndef _NETPRIO_CGROUP_H
 #define _NETPRIO_CGROUP_H
+
 #include <linux/cgroup.h>
 #include <linux/hardirq.h>
 #include <linux/rcupdate.h>
 
-
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 struct netprio_map {
 	struct rcu_head rcu;
 	u32 priomap_len;
@@ -27,8 +27,7 @@ struct netprio_map {
 
 void sock_update_netprioidx(struct sock *sk);
 
-#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP)
-
+#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO)
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
@@ -40,9 +39,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
 	rcu_read_unlock();
 	return idx;
 }
-
-#elif IS_MODULE(CONFIG_NETPRIO_CGROUP)
-
+#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO)
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
@@ -56,9 +53,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
 	return idx;
 }
 #endif
-
-#else /* !CONFIG_NETPRIO_CGROUP */
-
+#else /* !CONFIG_CGROUP_NET_PRIO */
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	return 0;
@@ -66,6 +61,5 @@ static inline u32 task_netprioidx(struct task_struct *p)
 
 #define sock_update_netprioidx(sk)
 
-#endif /* CONFIG_NETPRIO_CGROUP */
-
+#endif /* CONFIG_CGROUP_NET_PRIO */
 #endif  /* _NET_CLS_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 2ef3c3eca47a..ef5e2be6eaf3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -395,7 +395,7 @@ struct sock {
 	unsigned short		sk_ack_backlog;
 	unsigned short		sk_max_ack_backlog;
 	__u32			sk_priority;
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	__u32			sk_cgrp_prioidx;
 #endif
 	struct pid		*sk_peer_pid;
-- 
cgit v1.2.3


From 82a37132f300ea53bdcd812917af5a6329ec80c3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 29 Dec 2013 18:27:12 +0100
Subject: netfilter: x_tables: lightweight process control group matching

It would be useful e.g. in a server or desktop environment to have
a facility in the notion of fine-grained "per application" or "per
application group" firewall policies. Probably, users in the mobile,
embedded area (e.g. Android based) with different security policy
requirements for application groups could have great benefit from
that as well. For example, with a little bit of configuration effort,
an admin could whitelist well-known applications, and thus block
otherwise unwanted "hard-to-track" applications like [1] from a
user's machine. Blocking is just one example, but it is not limited
to that, meaning we can have much different scenarios/policies that
netfilter allows us than just blocking, e.g. fine grained settings
where applications are allowed to connect/send traffic to, application
traffic marking/conntracking, application-specific packet mangling,
and so on.

Implementation of PID-based matching would not be appropriate
as they frequently change, and child tracking would make that
even more complex and ugly. Cgroups would be a perfect candidate
for accomplishing that as they associate a set of tasks with a
set of parameters for one or more subsystems, in our case the
netfilter subsystem, which, of course, can be combined with other
cgroup subsystems into something more complex if needed.

As mentioned, to overcome this constraint, such processes could
be placed into one or multiple cgroups where different fine-grained
rules can be defined depending on the application scenario, while
e.g. everything else that is not part of that could be dropped (or
vice versa), thus making life harder for unwanted processes to
communicate to the outside world. So, we make use of cgroups here
to track jobs and limit their resources in terms of iptables
policies; in other words, limiting, tracking, etc what they are
allowed to communicate.

In our case we're working on outgoing traffic based on which local
socket that originated from. Also, one doesn't even need to have
an a-prio knowledge of the application internals regarding their
particular use of ports or protocols. Matching is *extremly*
lightweight as we just test for the sk_classid marker of sockets,
originating from net_cls. net_cls and netfilter do not contradict
each other; in fact, each construct can live as standalone or they
can be used in combination with each other, which is perfectly fine,
plus it serves Tejun's requirement to not introduce a new cgroups
subsystem. Through this, we result in a very minimal and efficient
module, and don't add anything except netfilter code.

One possible, minimal usage example (many other iptables options
can be applied obviously):

 1) Configuring cgroups if not already done, e.g.:

  mkdir /sys/fs/cgroup/net_cls
  mount -t cgroup -o net_cls net_cls /sys/fs/cgroup/net_cls
  mkdir /sys/fs/cgroup/net_cls/0
  echo 1 > /sys/fs/cgroup/net_cls/0/net_cls.classid
  (resp. a real flow handle id for tc)

 2) Configuring netfilter (iptables-nftables), e.g.:

  iptables -A OUTPUT -m cgroup ! --cgroup 1 -j DROP

 3) Running applications, e.g.:

  ping 208.67.222.222  <pid:1799>
  echo 1799 > /sys/fs/cgroup/net_cls/0/tasks
  64 bytes from 208.67.222.222: icmp_seq=44 ttl=49 time=11.9 ms
  [...]
  ping 208.67.220.220  <pid:1804>
  ping: sendmsg: Operation not permitted
  [...]
  echo 1804 > /sys/fs/cgroup/net_cls/0/tasks
  64 bytes from 208.67.220.220: icmp_seq=89 ttl=56 time=19.0 ms
  [...]

Of course, real-world deployments would make use of cgroups user
space toolsuite, or own custom policy daemons dynamically moving
applications from/to various cgroups.

  [1] http://www.blackhat.com/presentations/bh-europe-06/bh-eu-06-biondi/bh-eu-06-biondi-up.pdf

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: cgroups@vger.kernel.org
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/Kbuild      |  1 +
 include/uapi/linux/netfilter/xt_cgroup.h | 11 +++++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 include/uapi/linux/netfilter/xt_cgroup.h

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 91be8ce623f0..2344f5a319fc 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -39,6 +39,7 @@ header-y += xt_TEE.h
 header-y += xt_TPROXY.h
 header-y += xt_addrtype.h
 header-y += xt_bpf.h
+header-y += xt_cgroup.h
 header-y += xt_cluster.h
 header-y += xt_comment.h
 header-y += xt_connbytes.h
diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h
new file mode 100644
index 000000000000..43acb7e175f6
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_cgroup.h
@@ -0,0 +1,11 @@
+#ifndef _UAPI_XT_CGROUP_H
+#define _UAPI_XT_CGROUP_H
+
+#include <linux/types.h>
+
+struct xt_cgroup_info {
+	__u32 id;
+	__u32 invert;
+};
+
+#endif /* _UAPI_XT_CGROUP_H */
-- 
cgit v1.2.3