From 24ea591d2201c3257d666466e8fac50a6cf3c52f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:03 -0700 Subject: net: sched: extend percpu stats helpers qdisc_bstats_update_cpu() and other helpers were added to support percpu stats for qdisc. We want to add percpu stats for tc action, so this patch add common helpers. qdisc_bstats_update_cpu() is renamed to qdisc_bstats_cpu_update() qdisc_qstats_drop_cpu() is renamed to qdisc_qstats_cpu_drop() Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/sch_generic.h | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2738f6f87908..2eab08c38e32 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -513,17 +513,20 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; } -static inline void qdisc_bstats_update_cpu(struct Qdisc *sch, - const struct sk_buff *skb) +static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, + const struct sk_buff *skb) { - struct gnet_stats_basic_cpu *bstats = - this_cpu_ptr(sch->cpu_bstats); - u64_stats_update_begin(&bstats->syncp); bstats_update(&bstats->bstats, skb); u64_stats_update_end(&bstats->syncp); } +static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, + const struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); +} + static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { @@ -547,16 +550,24 @@ static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) sch->qstats.drops += count; } -static inline void qdisc_qstats_drop(struct Qdisc *sch) +static inline void qstats_drop_inc(struct gnet_stats_queue *qstats) { - sch->qstats.drops++; + qstats->drops++; } -static inline void qdisc_qstats_drop_cpu(struct Qdisc *sch) +static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats) { - struct gnet_stats_queue *qstats = this_cpu_ptr(sch->cpu_qstats); + qstats->overlimits++; +} - qstats->drops++; +static inline void qdisc_qstats_drop(struct Qdisc *sch) +{ + qstats_drop_inc(&sch->qstats); +} + +static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) +{ + qstats_drop_inc(this_cpu_ptr(sch->cpu_qstats)); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) -- cgit v1.2.3 From 519c818e8fb646eef1e8bfedd18519bec47bc9a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:04 -0700 Subject: net: sched: add percpu stats to actions Reuse existing percpu infrastructure John Fastabend added for qdisc. This patch adds a new cpustats parameter to tcf_hash_create() and all actions pass false, meaning this patch should have no effect yet. Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/act_api.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 3ee4c92afd1b..db2063ffd181 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -21,6 +21,8 @@ struct tcf_common { struct gnet_stats_rate_est64 tcfc_rate_est; spinlock_t tcfc_lock; struct rcu_head tcfc_rcu; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; }; #define tcf_head common.tcfc_head #define tcf_index common.tcfc_index @@ -103,7 +105,7 @@ int tcf_hash_release(struct tc_action *a, int bind); u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo); int tcf_hash_check(u32 index, struct tc_action *a, int bind); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind); + int size, int bind, bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); void tcf_hash_insert(struct tc_action *a); -- cgit v1.2.3 From cc6510a9504fd3c03d76bd68d99653148342eecc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:06 -0700 Subject: net_sched: act_gact: use a separate packet counters for gact_determ() Second step for gact RCU operation : We want to get rid of the spinlock protecting gact operations. Stats (packets/bytes) will soon be per cpu. gact_determ() would not work without a central packet counter, so lets add it for this mode. Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 9fc9b578908a..592a6bc02b0b 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -6,9 +6,10 @@ struct tcf_gact { struct tcf_common common; #ifdef CONFIG_GACT_PROB - u16 tcfg_ptype; - u16 tcfg_pval; - int tcfg_paction; + u16 tcfg_ptype; + u16 tcfg_pval; + int tcfg_paction; + atomic_t packets; #endif }; #define to_gact(a) \ -- cgit v1.2.3 From 56e5d1ca183d8616fab377d7d466c244b4dbb3b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:08 -0700 Subject: net_sched: act_gact: remove spinlock in fast path Final step for gact RCU operation : 1) Use percpu stats 2) update lastuse only every clock tick to avoid false sharing 3) Remove spinlock acquisition, as it is no longer needed. Since this is the last contended lock in packet RX when tc gact is used, this gives impressive gain. My host with 8 RX queues was handling 5 Mpps before the patch, and more than 11 Mpps after patch. Tested: On receiver : dev=eth0 tc qdisc del dev $dev ingress 2>/dev/null tc qdisc add dev $dev ingress tc filter del dev $dev root pref 10 2>/dev/null tc filter del dev $dev pref 10 2>/dev/null tc filter add dev $dev est 1sec 4sec parent ffff: protocol ip prio 1 \ u32 match ip src 7.0.0.0/8 flowid 1:15 action drop Sender sends packets flood from 7/8 network Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/act_api.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index db2063ffd181..8d2a707a9e87 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -70,6 +70,17 @@ static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf) kfree(hf->htab); } +/* Update lastuse only if needed, to avoid dirtying a cache line. + * We use a temp variable to avoid fetching jiffies twice. + */ +static inline void tcf_lastuse_update(struct tcf_t *tm) +{ + unsigned long now = jiffies; + + if (tm->lastuse != now) + tm->lastuse = now; +} + #ifdef CONFIG_NET_CLS_ACT #define ACT_P_CREATED 1 -- cgit v1.2.3 From 2ee22a90c7afac265bb6f7abea610b938195e2b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:09 -0700 Subject: net_sched: act_mirred: remove spinlock in fast path Like act_gact, act_mirred can be lockless in packet processing 1) Use percpu stats 2) update lastuse only every clock tick to avoid false sharing 3) use rcu to protect tcfm_dev 4) Remove spinlock usage, as it is no longer needed. Next step : add multi queue capability to ifb device Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Cc: Jamal Hadi Salim Cc: John Fastabend Acked-by: Jamal Hadi Salim Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 4dd77a1c106b..dae96bae1c19 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,7 +8,7 @@ struct tcf_mirred { int tcfm_eaction; int tcfm_ifindex; int tcfm_ok_push; - struct net_device *tcfm_dev; + struct net_device __rcu *tcfm_dev; struct list_head tcfm_list; }; #define to_mirred(a) \ -- cgit v1.2.3 From 071d5080e33d6f24139e4213c2d9f97a2c21b602 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:29 -0700 Subject: tcp: add tcp_in_slow_start helper Add a helper to test the slow start condition in various congestion control modules and other places. This is to prepare a slight improvement in policy as to exactly when to slow start. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 950cfecaad3c..dba22fc1b065 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -989,6 +989,11 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd <= tp->snd_ssthresh; +} + static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; @@ -1065,7 +1070,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) return tp->snd_cwnd < 2 * tp->max_packets_out; return tp->is_cwnd_limited; -- cgit v1.2.3 From 76174004a0f19785a328f40388e87e982bbf69b9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:30 -0700 Subject: tcp: do not slow start when cwnd equals ssthresh In the original design slow start is only used to raise cwnd when cwnd is stricly below ssthresh. It makes little sense to slow start when cwnd == ssthresh: especially when hystart has set ssthresh in the initial ramp, or after recovery when cwnd resets to ssthresh. Not doing so will also help reduce the buffer bloat slightly. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index dba22fc1b065..364426a2be5a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -991,7 +991,7 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd <= tp->snd_ssthresh; + return tp->snd_cwnd < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) -- cgit v1.2.3 From 3fd2f1b9d91284afb957ab9899a83279d0e09f29 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jul 2015 14:28:28 -0700 Subject: inet: remove BUG_ON() in twsk_destructor() Kernel will crash the same if one of the pointer is NULL anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/timewait_sock.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 68f0ecad6c6e..1a47946f95ba 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -33,9 +33,6 @@ static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) static inline void twsk_destructor(struct sock *sk) { - BUG_ON(sk == NULL); - BUG_ON(sk->sk_prot == NULL); - BUG_ON(sk->sk_prot->twsk_prot == NULL); if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) sk->sk_prot->twsk_prot->twsk_destructor(sk); } -- cgit v1.2.3 From fc01538f9fb75572c969ca9988176ffc2a8741d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jul 2015 14:28:29 -0700 Subject: inet: simplify timewait refcounting timewait sockets have a complex refcounting logic. Once we realize it should be similar to established and syn_recv sockets, we can use sk_nulls_del_node_init_rcu() and remove inet_twsk_unhash() In particular, deferred inet_twsk_put() added in commit 13475a30b66cd ("tcp: connect() race with timewait reuse") looks unecessary : When removing a timewait socket from ehash or bhash, caller must own a reference on the socket anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++-- include/net/inet_timewait_sock.h | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b73c88a19dd4..b07d126694a7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -205,8 +205,8 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); -int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw); -int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw); +void __inet_hash_nolisten(struct sock *sk, struct sock *osk); +void __inet_hash(struct sock *sk, struct sock *osk); void inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 360c4802288d..96f52a4711c8 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -100,10 +100,8 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) void inet_twsk_free(struct inet_timewait_sock *tw); void inet_twsk_put(struct inet_timewait_sock *tw); -int inet_twsk_unhash(struct inet_timewait_sock *tw); - -int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, - struct inet_hashinfo *hashinfo); +void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, -- cgit v1.2.3 From dbe7faa4045ea83a37b691b12bb02a8f86c2d2e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jul 2015 14:28:30 -0700 Subject: inet: inet_twsk_deschedule factorization inet_twsk_deschedule() calls are followed by inet_twsk_put(). Only particular case is in inet_twsk_purge() but there is no point to defer the inet_twsk_put() after re-enabling BH. Lets rename inet_twsk_deschedule() to inet_twsk_deschedule_put() and move the inet_twsk_put() inside. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 96f52a4711c8..879d6e5a973b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); -void inet_twsk_deschedule(struct inet_timewait_sock *tw); +void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family); -- cgit v1.2.3 From 35a256fee52c7c207796302681fa95189c85b408 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 8 Jul 2015 16:58:22 -0700 Subject: ipv6: Nonlocal bind Add support to allow non-local binds similar to how this was done for IPv4. Non-local binds are very useful in emulating the Internet in a box, etc. This add the ip_nonlocal_bind sysctl under ipv6. Testing: Set up nonlocal binding and receive routing on a host, e.g.: ip -6 rule add from ::/0 iif eth0 lookup 200 ip -6 route add local 2001:0:0:1::/64 dev lo proto kernel scope host table 200 sysctl -w net.ipv6.ip_nonlocal_bind=1 Set up routing to 2001:0:0:1::/64 on peer to go to first host ping6 -I 2001:0:0:1::1 peer-address -- to verify Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 8d93544a2d2b..c0368db6df54 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -31,6 +31,7 @@ struct netns_sysctl_ipv6 { int auto_flowlabels; int icmpv6_time; int anycast_src_echo_reply; + int ip_nonlocal_bind; int fwmark_reflect; int idgen_retries; int idgen_delay; -- cgit v1.2.3 From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:15:06 -0500 Subject: netfilter: Per network namespace netfilter hooks. - Add a new set of functions for registering and unregistering per network namespace hooks. - Modify the old global namespace hook functions to use the per network namespace hooks in their implementation, so their remains a single list that needs to be walked for any hook (this is important for keeping the hook priority working and for keeping the code walking the hooks simple). - Only allow registering the per netdevice hooks in the network namespace where the network device lives. - Dynamically allocate the structures in the per network namespace hook list in nf_register_net_hook, and unregister them in nf_unregister_net_hook. Dynamic allocate is required somewhere as the number of network namespaces are not fixed so we might as well allocate them in the registration function. The chain of registered hooks on any list is expected to be small so the cost of walking that list to find the entry we are unregistering should also be small. Performing the management of the dynamically allocated list entries in the registration and unregistration functions keeps the complexity from spreading. Signed-off-by: "Eric W. Biederman" --- include/net/netns/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 532e4ba64f49..38aa4983e2a9 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -14,5 +14,6 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif + struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; }; #endif -- cgit v1.2.3 From 5c48f1201744233d4f235c7dd916d5196ed20716 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Jun 2015 09:58:06 +0200 Subject: mac80211: remove exposing 'mfp' to drivers There's no driver using this, so remove it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6b1077c2a63f..43dbddfa06c0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1675,7 +1675,6 @@ struct ieee80211_sta_rates { * @tdls: indicates whether the STA is a TDLS peer * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. - * @mfp: indicates whether the STA uses management frame protection or not. * @txq: per-TID data TX queues (if driver uses the TXQ abstraction) */ struct ieee80211_sta { @@ -1693,7 +1692,6 @@ struct ieee80211_sta { struct ieee80211_sta_rates __rcu *rates; bool tdls; bool tdls_initiator; - bool mfp; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS]; -- cgit v1.2.3 From af9f9b22beee70aae58651cdbb9d6375e6e51797 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 11 Jun 2015 16:02:32 +0200 Subject: mac80211: don't store napi struct When introducing multiple RX queues, a single NAPI struct will not be sufficient. Instead of trying to store multiple, simply change the API to have the NAPI struct passed to the RX function. This of course means that drivers using rx_irqsafe() cannot use NAPI, but that seems a reasonable trade-off, particularly since only two of all drivers are currently using it at all. While at it, we can now remove the IEEE80211_RX_REORDER_TIMER flag again since this code path cannot have a napi struct anyway. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 43dbddfa06c0..ff68b8c4ab35 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3694,20 +3694,28 @@ void ieee80211_free_hw(struct ieee80211_hw *hw); void ieee80211_restart_hw(struct ieee80211_hw *hw); /** - * ieee80211_napi_add - initialize mac80211 NAPI context - * @hw: the hardware to initialize the NAPI context on - * @napi: the NAPI context to initialize - * @napi_dev: dummy NAPI netdevice, here to not waste the space if the - * driver doesn't use NAPI - * @poll: poll function - * @weight: default weight + * ieee80211_rx_napi - receive frame from NAPI context * - * See also netif_napi_add(). + * Use this function to hand received frames to mac80211. The receive + * buffer in @skb must start with an IEEE 802.11 header. In case of a + * paged @skb is used, the driver is recommended to put the ieee80211 + * header of the frame on the linear part of the @skb to avoid memory + * allocation and/or memcpy by the stack. + * + * This function may not be called in IRQ context. Calls to this function + * for a single hardware must be synchronized against each other. Calls to + * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be + * mixed for a single hardware. Must not run concurrently with + * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * + * This function must be called with BHs disabled. + * + * @hw: the hardware this frame came in on + * @skb: the buffer to receive, owned by mac80211 after this call + * @napi: the NAPI context */ -void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, - struct net_device *napi_dev, - int (*poll)(struct napi_struct *, int), - int weight); +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, + struct napi_struct *napi); /** * ieee80211_rx - receive frame @@ -3729,7 +3737,10 @@ void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ -void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb); +static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) +{ + ieee80211_rx_napi(hw, skb, NULL); +} /** * ieee80211_rx_irqsafe - receive frame -- cgit v1.2.3 From 0c028b5fd1bd10d5777756e571c6c1971f04062b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Jun 2015 14:33:54 +0200 Subject: mac80211: remove zero-length A-MPDU subframe reporting As there's no driver using this capability and reporting zero-length A-MPDU subframes for radiotap monitoring, remove the capability to free up two RX flags. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ff68b8c4ab35..7417fee18185 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -997,9 +997,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference * number (@ampdu_reference) must be populated and be a distinct number for * each A-MPDU - * @RX_FLAG_AMPDU_REPORT_ZEROLEN: driver reports 0-length subframes - * @RX_FLAG_AMPDU_IS_ZEROLEN: This is a zero-length subframe, for - * monitoring purposes only * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all * subframes of a single A-MPDU * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU @@ -1039,8 +1036,7 @@ enum mac80211_rx_flags { RX_FLAG_NO_SIGNAL_VAL = BIT(12), RX_FLAG_HT_GF = BIT(13), RX_FLAG_AMPDU_DETAILS = BIT(14), - RX_FLAG_AMPDU_REPORT_ZEROLEN = BIT(15), - RX_FLAG_AMPDU_IS_ZEROLEN = BIT(16), + /* bits 15/16 free */ RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), RX_FLAG_AMPDU_IS_LAST = BIT(18), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), -- cgit v1.2.3 From 981d94a80174e4f33bd5015fb49051bfc2eb00d2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Jun 2015 14:39:02 +0200 Subject: mac80211: support device/driver PN check for CCMP/GCMP When there are multiple RX queues, the PN checks in mac80211 cannot be used since packets might be processed out of order on different CPUs. Allow the driver to report that the PN has been checked, drivers that will use multi-queue RX will have to set this flag. For now, the flag is only valid when the frame has been decrypted, in theory that restriction doesn't have to be there, but in practice the hardware will have decrypted the frame already. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7417fee18185..4d3d2686f278 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -973,6 +973,10 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_IV_STRIPPED: The IV/ICV are stripped from this frame. * If this flag is set, the stack cannot do any replay detection * hence the driver or hardware will have to do that. + * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this + * flag indicates that the PN was verified for replay protection. + * Note that this flag is also currently only supported when a frame + * is also decrypted (ie. @RX_FLAG_DECRYPTED must be set) * @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on @@ -1036,7 +1040,8 @@ enum mac80211_rx_flags { RX_FLAG_NO_SIGNAL_VAL = BIT(12), RX_FLAG_HT_GF = BIT(13), RX_FLAG_AMPDU_DETAILS = BIT(14), - /* bits 15/16 free */ + RX_FLAG_PN_VALIDATED = BIT(15), + /* bit 16 free */ RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), RX_FLAG_AMPDU_IS_LAST = BIT(18), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), -- cgit v1.2.3 From b98fb44ffceeac717789e8f2fb3497e6b8c5c65b Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 10 Jun 2015 20:42:59 +0300 Subject: mac80211: define TDLS wider BW support bits Allow a device to specify support for the TDLS wider-bandwidth feature. Indicate this support during TDLS setup in the ext-capab IE and set an appropriate station flag when our TDLS peer supports it. This feature gives TDLS peers the ability to use a wider channel than the base width of the BSS. For instance VHT capable TDLS peers connected on a 20MHz channel can extend the channel to 80MHz, if regulatory considerations allow it. Do not cap the bandwidth of such stations by the current BSS channel width in mac80211. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 4d3d2686f278..8f61a230c482 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1887,6 +1887,9 @@ struct ieee80211_txq { * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands * in one command, mac80211 doesn't have to run separate scans per band. * + * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth + * than then BSS bandwidth for a TDLS link on the base channel. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -1919,6 +1922,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_CHANCTX_STA_CSA, IEEE80211_HW_SUPPORTS_CLONED_SKBS, IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS, + IEEE80211_HW_TDLS_WIDER_BW, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From f9a060f4b2003eb7350762e60dfc576447e44bad Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Jun 2015 14:55:34 +0200 Subject: mac80211: add pointer for driver use to key Some drivers may need to store data per key, for example for PN validation. Allow this by adding a pointer to the struct that the driver can assign. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8f61a230c482..484cc14fb947 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1492,8 +1492,10 @@ enum ieee80211_key_flags { * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type + * @drv_priv: pointer for driver use */ struct ieee80211_key_conf { + void *drv_priv; atomic64_t tx_pn; u32 cipher; u8 icv_len; -- cgit v1.2.3 From 33d8783c58427683b533664f67f8c4378ed64495 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Jun 2015 17:47:05 +0200 Subject: cfg80211: allow mgmt_frame_register callback to sleep This callback is currently not allowed to sleep, which makes it more difficult to implement proper driver methods in mac80211 than it has to be. Instead of doing asynchronous work here in mac80211, make it possible for the callback to sleep by doing some asynchronous work in cfg80211. This also enables improvements to other drivers, like ath6kl, that would like to sleep in this callback. While at it, also fix the code to call the driver on the implicit unregistration when an interface is removed, and do that also when a P2P-Device wdev is destroyed (otherwise we leak the structs.) Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a741678f24a2..9a529c48f6ca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2369,8 +2369,7 @@ struct cfg80211_qos_map { * method returns 0.) * * @mgmt_frame_register: Notify driver that a management frame type was - * registered. Note that this callback may not sleep, and cannot run - * concurrently with itself. + * registered. The callback is allowed to sleep. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may -- cgit v1.2.3 From b87a173e25d6bf5c26f13d329cdddf57dbd4061a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jul 2015 14:21:41 +0200 Subject: cls_cgroup: factor out classid retrieval Split out retrieving the cgroups net_cls classid retrieval into its own function, so that it can be reused later on from other parts of the traffic control subsystem. If there's no skb->sk, then the small helper returns 0 as well, which in cls_cgroup terms means 'could not classify'. Signed-off-by: Daniel Borkmann Cc: Thomas Graf Signed-off-by: David S. Miller --- include/net/cls_cgroup.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/net') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index c15d39456e14..ccd6d8bffa4d 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -49,9 +49,38 @@ static inline void sock_update_classid(struct sock *sk) if (classid != sk->sk_classid) sk->sk_classid = classid; } + +static inline u32 task_get_classid(const struct sk_buff *skb) +{ + u32 classid = task_cls_state(current)->classid; + + /* Due to the nature of the classifier it is required to ignore all + * packets originating from softirq context as accessing `current' + * would lead to false results. + * + * This test assumes that all callers of dev_queue_xmit() explicitly + * disable bh. Knowing this, it is possible to detect softirq based + * calls by looking at the number of nested bh disable calls because + * softirqs always disables bh. + */ + if (in_serving_softirq()) { + /* If there is an sk_classid we'll use that. */ + if (!skb->sk) + return 0; + + classid = skb->sk->sk_classid; + } + + return classid; +} #else /* !CONFIG_CGROUP_NET_CLASSID */ static inline void sock_update_classid(struct sock *sk) { } + +static inline u32 task_get_classid(const struct sk_buff *skb) +{ + return 0; +} #endif /* CONFIG_CGROUP_NET_CLASSID */ #endif /* _NET_CLS_CGROUP_H */ -- cgit v1.2.3 From 1a3b2ec93d4277b121979321b4024b438cb09504 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:50 -0700 Subject: switchdev: add offload_fwd_mark generator helper skb->offload_fwd_mark and dev->offload_fwd_mark are 32-bit and should be unique for device and may even be unique for a sub-set of ports within device, so add switchdev helper function to generate unique marks based on port's switch ID and group_ifindex. group_ifindex would typically be the container dev's ifindex, such as the bridge's ifindex. The generator uses a global hash table to store offload_fwd_marks hashed by {switch ID, group_ifindex} key. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d5671f118bfc..89da8934519b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -157,6 +157,9 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int idx); +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining); #else @@ -271,6 +274,12 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, return -EOPNOTSUPP; } +static inline void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ +} + #endif #endif /* _LINUX_SWITCHDEV_H_ */ -- cgit v1.2.3 From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:46 +0200 Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls Provides infrastructure to parse/dump/store encap information for light weight tunnels like mpls. Encap information for such tunnels is associated with fib routes. This infrastructure is based on previous suggestions from Eric Biederman to follow the xfrm infrastructure. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 include/net/lwtunnel.h (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h new file mode 100644 index 000000000000..df24b3611ff4 --- /dev/null +++ b/include/net/lwtunnel.h @@ -0,0 +1,132 @@ +#ifndef __NET_LWTUNNEL_H +#define __NET_LWTUNNEL_H 1 + +#include +#include +#include +#include +#include + +#define LWTUNNEL_HASH_BITS 7 +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) + +/* lw tunnel state flags */ +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 + +struct lwtunnel_state { + __u16 type; + __u16 flags; + atomic_t refcnt; + int len; + __u8 data[0]; +}; + +struct lwtunnel_encap_ops { + int (*build_state)(struct net_device *dev, struct nlattr *encap, + struct lwtunnel_state **ts); + int (*output)(struct sock *sk, struct sk_buff *skb); + int (*fill_encap)(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); + int (*get_encap_size)(struct lwtunnel_state *lwtstate); + int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); +}; + +extern const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; + +#ifdef CONFIG_LWTUNNEL +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ + atomic_inc(&lws->refcnt); +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ + if (!lws) + return; + + if (atomic_dec_and_test(&lws->refcnt)) + kfree(lws); +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) + return true; + + return false; +} + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws); +int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); + +#else + +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; + +} + +static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) +{ + return NULL; +} + +static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, + struct lwtunnel_state *b) +{ + return 0; +} + +#endif + +#endif /* __NET_LWTUNNEL_H */ -- cgit v1.2.3 From 571e722676fe386bb66f72a75b64a6ebf535c077 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:47 +0200 Subject: ipv4: support for fib route lwtunnel encap attributes This patch adds support in ipv4 fib functions to parse user provided encap attributes and attach encap state data to fib_nh and rtable. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 ++++- include/net/route.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 49c142bdf01e..5e0196084f1e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -44,7 +44,9 @@ struct fib_config { u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; - }; + struct nlattr *fc_encap; + u16 fc_encap_type; +}; struct fib_info; struct rtable; @@ -89,6 +91,7 @@ struct fib_nh { struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; + struct lwtunnel_state *nh_lwtstate; }; /* diff --git a/include/net/route.h b/include/net/route.h index fe22d03afb6a..2d45f419477f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,6 +66,7 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; + struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) -- cgit v1.2.3 From 19e42e45150672124b6a4341e2bc7982d247f0ac Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:48 +0200 Subject: ipv6: support for fib route lwtunnel encap attributes This patch adds support in ipv6 fib functions to parse Netlink RTA encap attributes and attach encap state data to rt6_info. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b76849c190f..276328e3daa6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -51,6 +51,8 @@ struct fib6_config { struct nlattr *fc_mp; struct nl_info fc_nlinfo; + struct nlattr *fc_encap; + u16 fc_encap_type; }; struct fib6_node { @@ -131,6 +133,7 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) -- cgit v1.2.3 From ffce41962ef64b8e685e5b621caf24bf381addd9 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:49 +0200 Subject: lwtunnel: support dst output redirect function This patch introduces lwtunnel_output function to call corresponding lwtunnels output function to xmit the packet. It adds two variants lwtunnel_output and lwtunnel_output6 for ipv4 and ipv6 respectively today. But this is subject to change when lwtstate will reside in dst or dst_metadata (as per upstream discussions). Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index df24b3611ff4..918e03c1dafa 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -69,6 +69,8 @@ int lwtunnel_fill_encap(struct sk_buff *skb, int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); +int lwtunnel_output(struct sock *sk, struct sk_buff *skb); +int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); #else @@ -127,6 +129,16 @@ static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, return 0; } +static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __NET_LWTUNNEL_H */ -- cgit v1.2.3 From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:53 +0200 Subject: mpls: ip tunnel support This implementation uses lwtunnel infrastructure to register hooks for mpls tunnel encaps. It picks cues from iptunnel_encaps infrastructure and previous mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/mpls_iptunnel.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 include/net/mpls_iptunnel.h (limited to 'include/net') diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h new file mode 100644 index 000000000000..4757997f76ed --- /dev/null +++ b/include/net/mpls_iptunnel.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 Cumulus Networks, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_IPTUNNEL_H +#define _NET_MPLS_IPTUNNEL_H 1 + +#define MAX_NEW_LABELS 2 + +struct mpls_iptunnel_encap { + u32 label[MAX_NEW_LABELS]; + u32 labels; +}; + +static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) +{ + return (struct mpls_iptunnel_encap *)lwtstate->data; +} + +#endif -- cgit v1.2.3 From 1d8fff907342d2339796dbd27ea47d0e76a6a2d0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:54 +0200 Subject: ip_tunnel: Make ovs_tunnel_info and ovs_key_ipv4_tunnel generic Rename the tunnel metadata data structures currently internal to OVS and make them generic for use by all IP tunnels. Both structures are kernel internal and will stay that way. Their members are exposed to user space through individual Netlink attributes by OVS. It will therefore be possible to extend/modify these structures without affecting user ABI. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d8214cb88bbc..6b9d559ce5f5 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -22,6 +22,28 @@ /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) +/* Used to memset ip_tunnel padding. */ +#define IP_TUNNEL_KEY_SIZE \ + (offsetof(struct ip_tunnel_key, tp_dst) + \ + FIELD_SIZEOF(struct ip_tunnel_key, tp_dst)) + +struct ip_tunnel_key { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + __be16 tun_flags; + __u8 ipv4_tos; + __u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; +} __packed __aligned(4); /* Minimize padding. */ + +struct ip_tunnel_info { + struct ip_tunnel_key key; + const void *options; + u8 options_len; +}; + /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { @@ -136,6 +158,47 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); +static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + tun_info->key.tun_id = tun_id; + tun_info->key.ipv4_src = saddr; + tun_info->key.ipv4_dst = daddr; + tun_info->key.ipv4_tos = tos; + tun_info->key.ipv4_ttl = ttl; + tun_info->key.tun_flags = tun_flags; + + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->key.tp_src = tp_src; + tun_info->key.tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; +} + +static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + __ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, tp_src, tp_dst, + tun_id, tun_flags, opts, opts_len); +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); -- cgit v1.2.3 From f38a9eb1f77b296ff07e000823884a0f64d67b2a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:56 +0200 Subject: dst: Metadata destinations Introduces a new dst_metadata which enables to carry per packet metadata between forwarding and processing elements via the skb->dst pointer. The structure is set up to be a union. Thus, each separate type of metadata requires its own dst instance. If demand arises to carry multiple types of metadata concurrently, metadata dst entries can be made stackable. The metadata dst entry is refcnt'ed as expected for now but a non reference counted use is possible if the reference is forced before queueing the skb. In order to allow allocating dsts with variable length, the existing dst_alloc() is split into a dst_alloc() and dst_init() function. The existing dst_init() function to initialize the subsystem is being renamed to dst_subsys_init() to make it clear what is what. The check before ip_route_input() is changed to ignore metadata dsts and drop the dst inside the routing function thus allowing to interpret metadata in a later commit. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst.h | 6 +++++- include/net/dst_metadata.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 include/net/dst_metadata.h (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 2bc73f8a00a9..2578811cef51 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -57,6 +57,7 @@ struct dst_entry { #define DST_FAKE_RTABLE 0x0040 #define DST_XFRM_TUNNEL 0x0080 #define DST_XFRM_QUEUE 0x0100 +#define DST_METADATA 0x0200 unsigned short pending_confirm; @@ -356,6 +357,9 @@ static inline int dst_discard(struct sk_buff *skb) } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags); void __dst_free(struct dst_entry *dst); struct dst_entry *dst_destroy(struct dst_entry *dst); @@ -457,7 +461,7 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) return dst; } -void dst_init(void); +void dst_subsys_init(void); /* Flags for xfrm_lookup flags argument. */ enum { diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h new file mode 100644 index 000000000000..4f7694f3c7d0 --- /dev/null +++ b/include/net/dst_metadata.h @@ -0,0 +1,32 @@ +#ifndef __NET_DST_METADATA_H +#define __NET_DST_METADATA_H 1 + +#include +#include +#include + +struct metadata_dst { + struct dst_entry dst; + size_t opts_len; +}; + +static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); + + if (md_dst && md_dst->dst.flags & DST_METADATA) + return md_dst; + + return NULL; +} + +static inline bool skb_valid_dst(const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + return dst && !(dst->flags & DST_METADATA); +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); + +#endif /* __NET_DST_METADATA_H */ -- cgit v1.2.3 From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:58 +0200 Subject: vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 13 +++++++++++++ include/net/ip_tunnels.h | 14 ++++++++++++++ include/net/vxlan.h | 10 +++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4f7694f3c7d0..e843937fb30a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -8,6 +8,9 @@ struct metadata_dst { struct dst_entry dst; size_t opts_len; + union { + struct ip_tunnel_info tun_info; + } u; }; static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) @@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (md_dst) + return &md_dst->u.tun_info; + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6b9d559ce5f5..d11530f1c1e2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -38,10 +38,19 @@ struct ip_tunnel_key { __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. */ +/* Indicates whether the tunnel info structure represents receive + * or transmit tunnel parameters. + */ +enum { + IP_TUNNEL_INFO_RX, + IP_TUNNEL_INFO_TX, +}; + struct ip_tunnel_info { struct ip_tunnel_key key; const void *options; u8 options_len; + u8 mode; }; /* 6rd prefix/relay information */ @@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err, } } +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +{ + return info + 1; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0082b5d33d7d..80a2da29e088 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1< Date: Tue, 21 Jul 2015 10:43:59 +0200 Subject: route: Extend flow representation with tunnel key Add a new flowi_tunnel structure which is a subset of ip_tunnel_key to allow routes to match on tunnel metadata. For now, the tunnel id is added to flowi_tunnel which allows for routes to be bound to specific virtual tunnels. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 8109a159d1b3..3098ae33a178 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -19,6 +19,10 @@ #define LOOPBACK_IFINDEX 1 +struct flowi_tunnel { + __be64 tun_id; +}; + struct flowi_common { int flowic_oif; int flowic_iif; @@ -30,6 +34,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 __u32 flowic_secid; + struct flowi_tunnel flowic_tun_key; }; union flowi_uli { @@ -66,6 +71,7 @@ struct flowi4 { #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid +#define flowi4_tun_key __fl_common.flowic_tun_key /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -95,6 +101,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; + fl4->flowi4_tun_key.tun_id = 0; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; @@ -165,6 +172,7 @@ struct flowi { #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid +#define flowi_tun_key u.__fl_common.flowic_tun_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) -- cgit v1.2.3 From 3093fbe7ff4bc7d1571fc217dade1cf80330a714 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:00 +0200 Subject: route: Per route IP tunnel metadata via lightweight tunnel This introduces a new IP tunnel lightweight tunnel type which allows to specify IP tunnel instructions per route. Only IPv4 is supported at this point. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 12 +++++++++++- include/net/ip_tunnels.h | 7 ++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index e843937fb30a..7b0306894663 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,13 +23,23 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, + int family) { struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct rtable *rt; if (md_dst) return &md_dst->u.tun_info; + switch (family) { + case AF_INET: + rt = (struct rtable *)skb_dst(skb); + if (rt && rt->rt_lwtstate) + return lwt_tun_info(rt->rt_lwtstate); + break; + } + return NULL; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d11530f1c1e2..0b7e18cfa0b4 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -9,9 +9,9 @@ #include #include #include -#include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -298,6 +298,11 @@ static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) return info + 1; } +static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) +{ + return (struct ip_tunnel_info *)lwtstate->data; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ -- cgit v1.2.3 From e7030878fc8448492b6e5cecd574043f63271298 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:01 +0200 Subject: fib: Add fib rule match on tunnel id This add the ability to select a routing table based on the tunnel id which allows to maintain separate routing tables for each virtual tunnel network. ip rule add from all tunnel-id 100 lookup 100 ip rule add from all tunnel-id 200 lookup 200 A new static key controls the collection of metadata at tunnel level upon demand. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/fib_rules.h | 1 + include/net/ip_tunnels.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 903a55efbffe..4e8f804f4589 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -19,6 +19,7 @@ struct fib_rule { u8 action; /* 3 bytes hole, try to use */ u32 target; + __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0b7e18cfa0b4..0a5a7763eec2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -303,6 +303,17 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat return (struct ip_tunnel_info *)lwtstate->data; } +extern struct static_key ip_tunnel_metadata_cnt; + +/* Returns > 0 if metadata should be collected */ +static inline int ip_tunnel_collect_metadata(void) +{ + return static_key_false(&ip_tunnel_metadata_cnt); +} + +void ip_tunnel_need_metadata(void); +void ip_tunnel_unneed_metadata(void); + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ -- cgit v1.2.3 From 0dfbdf4102b9303d3ddf2177c0220098ff99f6de Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:02 +0200 Subject: vxlan: Factor out device configuration This factors out the device configuration out of the RTNL newlink API which allows for in-kernel creation of VXLAN net_devices. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/vxlan.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 80a2da29e088..19535f85eb2c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -95,6 +95,11 @@ struct vxlanhdr { #define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1< Date: Tue, 21 Jul 2015 10:44:06 +0200 Subject: openvswitch: Use regular VXLAN net_device device This gets rid of all OVS specific VXLAN code in the receive and transmit path by using a VXLAN net_device to represent the vport. Only a small shim layer remains which takes care of handling the VXLAN specific OVS Netlink configuration. Unexports vxlan_sock_add(), vxlan_sock_release(), vxlan_xmit_skb() since they are no longer needed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 1 + include/net/vxlan.h | 24 ++++-------------------- 2 files changed, 5 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 343d922d15c2..18fdb98185ab 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -141,6 +141,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[]); +int rtnl_delete_link(struct net_device *dev); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 19535f85eb2c..eb8d721cdb67 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -101,22 +101,12 @@ struct vxlanhdr { #define FDB_HASH_SIZE (1<vn_sock->sock->sk)->inet_sport; +} static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) -- cgit v1.2.3 From e181a5430491f038c198f0eacc3142d6e871c2da Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 19 Jul 2015 22:21:13 +0200 Subject: net: #ifdefify sk_classid member of struct sock The sk_classid member is only required when CONFIG_CGROUP_NET_CLASSID is enabled. #ifdefify it to reduce the size of struct sock on 32 bit systems, at least. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 05a8c1aea251..4353ef70bf48 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -429,7 +429,9 @@ struct sock { void *sk_security; #endif __u32 sk_mark; +#ifdef CONFIG_CGROUP_NET_CLASSID u32 sk_classid; +#endif struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); -- cgit v1.2.3 From 052831879945be0d9fad2216b127147c565ec1b1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 22 Jul 2015 14:43:58 +0200 Subject: ip_tunnel: Provide tunnel metadata API for CONFIG_INET=n Account for the configuration FIB_RULES=y && INET=n as FIB_RULES can be selected by IPV6 or DECNET without INET. Fixes: e7030878fc84 ("fib: Add fib rule match on tunnel id") Fixes: 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel") Reported-by: kbuild test robot Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0a5a7763eec2..d975b3ebd6c7 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -314,6 +314,21 @@ static inline int ip_tunnel_collect_metadata(void) void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); +#else /* CONFIG_INET */ + +static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) +{ + return NULL; +} + +static inline void ip_tunnel_need_metadata(void) +{ +} + +static inline void ip_tunnel_unneed_metadata(void) +{ +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ -- cgit v1.2.3 From 045a0fa0c5f5ea0f16c009f924ea579634afbba8 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jul 2015 10:08:44 +0200 Subject: ip_tunnel: Call ip_tunnel_core_init() from inet_init() Convert the module_init() to a invocation from inet_init() since ip_tunnel_core is part of the INET built-in. Fixes: 3093fbe7ff4 ("route: Per route IP tunnel metadata via lightweight tunnel") Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d975b3ebd6c7..47984415f5d1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -311,6 +311,8 @@ static inline int ip_tunnel_collect_metadata(void) return static_key_false(&ip_tunnel_metadata_cnt); } +void __init ip_tunnel_core_init(void); + void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); -- cgit v1.2.3 From a6cb869b3b7c16fd7c3ee766dd9f9a4fdda7edf9 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Wed, 24 Jun 2015 11:36:35 +0200 Subject: cfg802154: add PM hooks This patch help to implement suspend/resume in mac802154, these hooks will be run before the device is suspended and after it resumes. Signed-off-by: Varka Bhadram Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 290a9a69af07..382f94b59f2f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -34,6 +34,8 @@ struct cfg802154_ops { int type); void (*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy, struct net_device *dev); + int (*suspend)(struct wpan_phy *wpan_phy); + int (*resume)(struct wpan_phy *wpan_phy); int (*add_virtual_intf)(struct wpan_phy *wpan_phy, const char *name, unsigned char name_assign_type, -- cgit v1.2.3 From 729a8989b3fa8ae7965c537dfccbd08512e84d3c Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Tue, 7 Jul 2015 10:50:42 +0530 Subject: mac802154: do not export ieee802154_rx() Right now there are no other users for ieee802154_rx() in kernel. So lets remove EXPORT_SYMBOL() for this. Also it moves the function prototype from global header file to local header file. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index f534a46911dc..b7f99615224b 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -320,23 +320,6 @@ int ieee802154_register_hw(struct ieee802154_hw *hw); */ void ieee802154_unregister_hw(struct ieee802154_hw *hw); -/** - * ieee802154_rx - receive frame - * - * Use this function to hand received frames to mac802154. The receive - * buffer in @skb must start with an IEEE 802.15.4 header. In case of a - * paged @skb is used, the driver is recommended to put the ieee802154 - * header of the frame on the linear part of the @skb to avoid memory - * allocation and/or memcpy by the stack. - * - * This function may not be called in IRQ context. Calls to this function - * for a single hardware must be synchronized against each other. - * - * @hw: the hardware this frame came in on - * @skb: the buffer to receive, owned by mac802154 after this call - */ -void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb); - /** * ieee802154_rx_irqsafe - receive frame * -- cgit v1.2.3 From cb02a25583b59ce48267472cd092485d754964f9 Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:38 +0100 Subject: Bluetooth: __l2cap_wait_ack() use msecs_to_jiffies() Use msecs_to_jiffies() instead of using HZ so that it is easier to specify the time in milliseconds. Also add a #define L2CAP_WAIT_ACK_POLL_PERIOD to specify the 200ms polling period so that it is defined in a single place. Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 2239a3753092..3dcad4159b0b 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -55,6 +55,7 @@ #define L2CAP_INFO_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_ERTX_TIMEOUT msecs_to_jiffies(60000) +#define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_A2MP_DEFAULT_MTU 670 -- cgit v1.2.3 From e432c72c464d2deb6c66d1e2a5f548dc1f0ef4dc Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:39 +0100 Subject: Bluetooth: __l2cap_wait_ack() add defensive timeout Add a timeout to prevent the do while loop running in an infinite loop. This ensures that the channel will be instructed to close within 10 seconds so prevents l2cap_sock_shutdown() getting stuck forever. Returns -ENOLINK when the timeout is reached. The channel will be subequently closed and not all data will be ACK'ed. Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 3dcad4159b0b..c98afc08cc26 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -56,6 +56,7 @@ #define L2CAP_MOVE_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_ERTX_TIMEOUT msecs_to_jiffies(60000) #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) +#define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) #define L2CAP_A2MP_DEFAULT_MTU 670 -- cgit v1.2.3 From 8757825b128e71319150219b0745d3ecb87f34aa Mon Sep 17 00:00:00 2001 From: Seungyoun Ju Date: Mon, 13 Jul 2015 17:28:13 +0900 Subject: Bluetooth: hci_check_conn_params() check proper range Slave latency range has been changed in Core Spec. 4.2 by Erratum 5419 of ESR08_V1.0.0. And it should be applied to Core Spec. 4.0 and 4.1. Before: connSlaveLatency <= ((connSupervisionTimeout / connIntervalMax) - 1) After: connSlaveLatency <= ((connSupervisionTimeout / (connIntervalMax*2)) - 1) This patch makes hci_check_conn_params() check the allowable slave latency range using the changed way. Signed-off-by: Seungyoun Ju Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 3bd618d3e55d..2a6b0919e23f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1297,7 +1297,7 @@ static inline int hci_check_conn_params(u16 min, u16 max, u16 latency, if (max >= to_multiplier * 8) return -EINVAL; - max_latency = (to_multiplier * 8 / max) - 1; + max_latency = (to_multiplier * 4 / max) - 1; if (latency > 499 || latency > max_latency) return -EINVAL; -- cgit v1.2.3 From 5a6228a0b472062646434cd2536d109c102b606e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 12:28:36 +0200 Subject: lwtunnel: change prototype of lwtunnel_state_get() It saves some lines and simplify a bit the code when the state is returning by this function. It's also useful to handle a NULL entry. To avoid too long lines, I've also renamed lwtunnel_state_get() and lwtunnel_state_put() to lwtstate_get() and lwtstate_put(). CC: Thomas Graf CC: Roopa Prabhu Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 918e03c1dafa..b02039081b04 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -35,12 +35,16 @@ extern const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; #ifdef CONFIG_LWTUNNEL -static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +static inline struct lwtunnel_state * +lwtstate_get(struct lwtunnel_state *lws) { - atomic_inc(&lws->refcnt); + if (lws) + atomic_inc(&lws->refcnt); + + return lws; } -static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; @@ -74,11 +78,13 @@ int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); #else -static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +static inline struct lwtunnel_state * +lwtstate_get(struct lwtunnel_state *lws) { + return lws; } -static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +static inline void lwtstate_put(struct lwtunnel_state *lws) { } -- cgit v1.2.3 From 205845a34763432040496908c8f52f1f97e5ee62 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 24 Jul 2015 15:50:31 +0200 Subject: bonding: convert num_grat_arp to the new bonding option API num_grat_arp wasn't converted to the new bonding option API, so do this now and remove the specific sysfs store option in order to use the standard one. num_grat_arp is the same as num_unsol_na so add it as an alias with the same option settings. An important difference is the option name which is matched in bond_sysfs_store_option(). Signed-off-by: Nikolay Aleksandrov Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/net/bond_options.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bond_options.h b/include/net/bond_options.h index c28aca25320e..1797235cd590 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -66,6 +66,7 @@ enum { BOND_OPT_AD_ACTOR_SYS_PRIO, BOND_OPT_AD_ACTOR_SYSTEM, BOND_OPT_AD_USER_PORT_KEY, + BOND_OPT_NUM_PEER_NOTIF_ALIAS, BOND_OPT_LAST }; -- cgit v1.2.3 From 877d1f6291f8e391237e324be58479a3e3a7407c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 28 Jul 2015 16:02:05 -0700 Subject: net: Set sk_txhash from a random number This patch creates sk_set_txhash and eliminates protocol specific inet_set_txhash and ip6_set_txhash. sk_set_txhash simply sets a random number instead of performing flow dissection. sk_set_txash is also allowed to be called multiple times for the same socket, we'll need this when redoing the hash for negative routing advice. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip.h | 16 ---------------- include/net/ipv6.h | 19 ------------------- include/net/sock.h | 8 ++++++++ 3 files changed, 8 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index d5fe9f2ab699..bee5f3582e38 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -370,22 +370,6 @@ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } -static inline void inet_set_txhash(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - keys.addrs.v4addrs.src = inet->inet_saddr; - keys.addrs.v4addrs.dst = inet->inet_daddr; - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys.ports.src = inet->inet_sport; - keys.ports.dst = inet->inet_dport; - - sk->sk_txhash = flow_hash_from_keys(&keys); -} - static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto) { const struct iphdr *iph = skb_gro_network_header(skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 82dbdb092a5d..7c79798bcaab 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -707,25 +707,6 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, } #if IS_ENABLED(CONFIG_IPV6) -static inline void ip6_set_txhash(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - memcpy(&keys.addrs.v6addrs.src, &np->saddr, - sizeof(keys.addrs.v6addrs.src)); - memcpy(&keys.addrs.v6addrs.dst, &sk->sk_v6_daddr, - sizeof(keys.addrs.v6addrs.dst)); - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys.ports.src = inet->inet_sport; - keys.ports.dst = inet->inet_dport; - - sk->sk_txhash = flow_hash_from_keys(&keys); -} - static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel) { diff --git a/include/net/sock.h b/include/net/sock.h index 4353ef70bf48..fe735c4841f6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1687,6 +1687,14 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) kuid_t sock_i_uid(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); +static inline void sk_set_txhash(struct sock *sk) +{ + sk->sk_txhash = prandom_u32(); + + if (unlikely(!sk->sk_txhash)) + sk->sk_txhash = 1; +} + static inline struct dst_entry * __sk_dst_get(struct sock *sk) { -- cgit v1.2.3 From 265f94ff54d62503663d9c788ba1f082e448f8b8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 28 Jul 2015 16:02:06 -0700 Subject: net: Recompute sk_txhash on negative routing advice When a connection is failing a transport protocol calls dst_negative_advice to try to get a better route. This patch includes changing the sk_txhash in that function. This provides a rudimentary method to try to find a different path in the network since sk_txhash affects ECMP on the local host and through the network (via flow labels or UDP source port in encapsulation). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/sock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index fe735c4841f6..24aa75c5317a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1695,6 +1695,12 @@ static inline void sk_set_txhash(struct sock *sk) sk->sk_txhash = 1; } +static inline void sk_rethink_txhash(struct sock *sk) +{ + if (sk->sk_txhash) + sk_set_txhash(sk); +} + static inline struct dst_entry * __sk_dst_get(struct sock *sk) { @@ -1719,6 +1725,8 @@ static inline void dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); + sk_rethink_txhash(sk); + if (dst && dst->ops->negative_advice) { ndst = dst->ops->negative_advice(dst); -- cgit v1.2.3 From 92a99bf3bae7c1267db87bb3e3babda2c6dcc8a7 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 29 Jul 2015 09:45:40 +0200 Subject: lwtunnel: Make lwtun_encaps[] static Any external user should use the registration API instead of accessing this directly. Cc: Roopa Prabhu Signed-off-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index b02039081b04..33bd30963a95 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -31,9 +31,6 @@ struct lwtunnel_encap_ops { int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); }; -extern const struct lwtunnel_encap_ops __rcu * - lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; - #ifdef CONFIG_LWTUNNEL static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) -- cgit v1.2.3 From d3aa45ce6b94c65b83971257317867db13e5f492 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Jul 2015 15:36:57 -0700 Subject: bpf: add helpers to access tunnel metadata Introduce helpers to let eBPF programs attached to TC manipulate tunnel metadata: bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) skb: pointer to skb key: pointer to 'struct bpf_tunnel_key' size: size of 'struct bpf_tunnel_key' flags: room for future extensions First eBPF program that uses these helpers will allocate per_cpu metadata_dst structures that will be used on TX. On RX metadata_dst is allocated by tunnel driver. Typical usage for TX: struct bpf_tunnel_key tkey; ... populate tkey ... bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); RX: struct bpf_tunnel_key tkey = {}; bpf_skb_get_tunnel_key(skb, &tkey, sizeof(tkey), 0); ... lookup or redirect based on tkey ... 'struct bpf_tunnel_key' will be extended in the future by adding elements to the end and the 'size' argument will indicate which fields are populated, thereby keeping backwards compatibility. The 'flags' argument may be used as well when the 'size' is not enough or to indicate completely different layout of bpf_tunnel_key. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 7b0306894663..075f523ff23f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -51,5 +51,6 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) } struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); #endif /* __NET_DST_METADATA_H */ -- cgit v1.2.3 From 343d60aada5a358ca186d6e9e353230379c426d8 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 30 Jul 2015 13:34:53 -0700 Subject: ipv6: change ipv6_stub_impl.ipv6_dst_lookup to take net argument This patch adds net argument to ipv6_stub_impl.ipv6_dst_lookup for use cases where sk is not available (like mpls). sk appears to be needed to get the namespace 'net' and is optional otherwise. This patch series changes ipv6_stub_impl.ipv6_dst_lookup to take net argument. sk remains optional. All callers of ipv6_stub_impl.ipv6_dst_lookup have been modified to pass net. I have modified them to use already available 'net' in the scope of the call. I can change them to sock_net(sk) to avoid any unintended change in behaviour if sock namespace is different. They dont seem to be from code inspection. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 ++-- include/net/ipv6.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index def59d3a34d5..0c3ac5acb85f 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -158,8 +158,8 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, const struct in6_addr *addr); - int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst, - struct flowi6 *fl6); + int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, + struct dst_entry **dst, struct flowi6 *fl6); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *daddr, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7c79798bcaab..eecdfc92f807 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -813,7 +813,8 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); +int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, + struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, -- cgit v1.2.3 From 67800f9b1f4eb5bbefc32e3f5044097354bc85b3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:11 -0700 Subject: ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel We can't call skb_get_hash here since the packet is not complete to do flow_dissector. Create hash based on flowi6 instead. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index eecdfc92f807..3e334b33ef3a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -708,12 +708,13 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #if IS_ENABLED(CONFIG_IPV6) static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, - __be32 flowlabel, bool autolabel) + __be32 flowlabel, bool autolabel, + struct flowi6 *fl6) { if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { u32 hash; - hash = skb_get_hash(skb); + hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an -- cgit v1.2.3 From 42240901f7c438636715b9cb6ed93f4441ffc091 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:12 -0700 Subject: ipv6: Implement different admin modes for automatic flow labels Change the meaning of net.ipv6.auto_flowlabels to provide a mode for automatic flow labels generation. There are four modes: 0: flow labels are disabled 1: flow labels are enabled, sockets can opt-out 2: flow labels are allowed, sockets can opt-in 3: flow labels are enabled and enforced, no opt-out for sockets np->autoflowlabel is initialized according to the sysctl value. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ipv6.h | 59 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3e334b33ef3a..c02c1c03363a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -707,36 +707,69 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, } #if IS_ENABLED(CONFIG_IPV6) + +/* Sysctl settings for net ipv6.auto_flowlabels */ +#define IP6_AUTO_FLOW_LABEL_OFF 0 +#define IP6_AUTO_FLOW_LABEL_OPTOUT 1 +#define IP6_AUTO_FLOW_LABEL_OPTIN 2 +#define IP6_AUTO_FLOW_LABEL_FORCED 3 + +#define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED + +#define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OFF + static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { - if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { - u32 hash; + u32 hash; - hash = skb_get_hash_flowi6(skb, fl6); + if (flowlabel || + net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || + (!autolabel && + net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) + return flowlabel; - /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an - * attacker is leaked. Only lower 20 bits are relevant. - */ - hash ^= hash >> 12; + hash = skb_get_hash_flowi6(skb, fl6); - flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; + /* Since this is being sent on the wire obfuscate hash a bit + * to minimize possbility that any useful information to an + * attacker is leaked. Only lower 20 bits are relevant. + */ + rol32(hash, 16); - if (net->ipv6.sysctl.flowlabel_state_ranges) - flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; - } + flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; + + if (net->ipv6.sysctl.flowlabel_state_ranges) + flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } + +static inline int ip6_default_np_autolabel(struct net *net) +{ + switch (net->ipv6.sysctl.auto_flowlabels) { + case IP6_AUTO_FLOW_LABEL_OFF: + case IP6_AUTO_FLOW_LABEL_OPTIN: + default: + return 0; + case IP6_AUTO_FLOW_LABEL_OPTOUT: + case IP6_AUTO_FLOW_LABEL_FORCED: + return 1; + } +} #else static inline void ip6_set_txhash(struct sock *sk) { } static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, - __be32 flowlabel, bool autolabel) + __be32 flowlabel, bool autolabel, + struct flowi6 *fl6) { return flowlabel; } +static inline int ip6_default_np_autolabel(struct net *net) +{ + return 0; +} #endif -- cgit v1.2.3 From b56774163f994efce3f5603f35aa4e677c3e725a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:14 -0700 Subject: ipv6: Enable auto flow labels by default Initialize auto_flowlabels to one. This enables automatic flow labels, individual socket may disable them using the IPV6_AUTOFLOWLABEL socket option. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c02c1c03363a..711cca428cc8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -716,7 +716,7 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED -#define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OFF +#define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, -- cgit v1.2.3 From bbde9fc1824aab58bc78c084163007dd6c03fe5b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 31 May 2015 17:54:44 +0200 Subject: netfilter: factor out packet duplication for IPv4/IPv6 Extracted from the xtables TEE target. This creates two new modules for IPv4 and IPv6 that are shared between the TEE target and the new nf_tables dup expressions. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_dup_ipv4.h | 7 +++++++ include/net/netfilter/ipv6/nf_dup_ipv6.h | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 include/net/netfilter/ipv4/nf_dup_ipv4.h create mode 100644 include/net/netfilter/ipv6/nf_dup_ipv6.h (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_dup_ipv4.h b/include/net/netfilter/ipv4/nf_dup_ipv4.h new file mode 100644 index 000000000000..42008f10dfc4 --- /dev/null +++ b/include/net/netfilter/ipv4/nf_dup_ipv4.h @@ -0,0 +1,7 @@ +#ifndef _NF_DUP_IPV4_H_ +#define _NF_DUP_IPV4_H_ + +void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct in_addr *gw, int oif); + +#endif /* _NF_DUP_IPV4_H_ */ diff --git a/include/net/netfilter/ipv6/nf_dup_ipv6.h b/include/net/netfilter/ipv6/nf_dup_ipv6.h new file mode 100644 index 000000000000..ed6bd66fa5a0 --- /dev/null +++ b/include/net/netfilter/ipv6/nf_dup_ipv6.h @@ -0,0 +1,7 @@ +#ifndef _NF_DUP_IPV6_H_ +#define _NF_DUP_IPV6_H_ + +void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, + const struct in6_addr *gw, int oif); + +#endif /* _NF_DUP_IPV6_H_ */ -- cgit v1.2.3 From d877f07112f1e5a247c6b585c971a93895c9f738 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 31 May 2015 18:04:11 +0200 Subject: netfilter: nf_tables: add nft_dup expression This new expression uses the nf_dup engine to clone packets to a given gateway. Unlike xt_TEE, we use an index to indicate output interface which should be fine at this stage. Moreover, change to the preemtion-safe this_cpu_read(nf_skb_duplicated) from nf_dup_ipv{4,6} to silence a lockdep splat. Based on the original tee expression from Arturo Borrero Gonzalez, although this patch has diverted quite a bit from this initial effort due to the change to support maps. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_dup.h | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 include/net/netfilter/nft_dup.h (limited to 'include/net') diff --git a/include/net/netfilter/nft_dup.h b/include/net/netfilter/nft_dup.h new file mode 100644 index 000000000000..6b84cf6491a2 --- /dev/null +++ b/include/net/netfilter/nft_dup.h @@ -0,0 +1,9 @@ +#ifndef _NFT_DUP_H_ +#define _NFT_DUP_H_ + +struct nft_dup_inet { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +#endif /* _NFT_DUP_H_ */ -- cgit v1.2.3 From 3499abb249bb5ed9d21031944bc3059ec4aa2909 Mon Sep 17 00:00:00 2001 From: Andreas Schultz Date: Wed, 5 Aug 2015 17:51:45 +0200 Subject: netfilter: nfacct: per network namespace support - Move the nfnl_acct_list into the network namespace, initialize and destroy it per namespace - Keep track of refcnt on nfacct objects, the old logic does not longer work with a per namespace list - Adjust xt_nfacct to pass the namespace when registring objects Signed-off-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/net/net_namespace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e951453e0a23..2dcea635ecce 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -118,6 +118,9 @@ struct net { #endif struct sock *nfnl; struct sock *nfnl_stash; +#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) + struct list_head nfnl_acct_list; +#endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; -- cgit v1.2.3 From da8b43c0e1dcea3bcac5f37ea59934ddaa137aed Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 4 Aug 2015 22:51:07 -0700 Subject: vxlan: combine VXLAN_FLOWBASED into VXLAN_COLLECT_METADATA IFLA_VXLAN_FLOWBASED is useless without IFLA_VXLAN_COLLECT_METADATA, so combine them into single IFLA_VXLAN_COLLECT_METADATA flag. 'flowbased' doesn't convey real meaning of the vxlan tunnel mode. This mode can be used by routing, tc+bpf and ovs. Only ovs is strictly flow based, so 'collect metadata' is a better name for this tunnel mode. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/vxlan.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index eb8d721cdb67..e4534f1b2d8c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -181,7 +181,6 @@ struct vxlan_dev { #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 -#define VXLAN_F_FLOW_BASED 0x4000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -190,8 +189,7 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_FLOW_BASED) + VXLAN_F_COLLECT_METADATA) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); -- cgit v1.2.3 From 1525c386a1f01612c6f3f27241113d7fc8e6d72d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 6 Aug 2015 01:44:02 -0400 Subject: net: switchdev: change fdb addr for a byte array The address in the switchdev_obj_fdb structure is currently represented as a pointer. Replacing it for a 6-byte array allows switchdev to carry addresses directly read from hardware registers, not stored by the switch chip driver (as in Rocker). Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 89da8934519b..e90e1a0fa579 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -70,7 +70,7 @@ struct switchdev_obj { u32 tb_id; } ipv4_fib; struct switchdev_obj_fdb { /* PORT_FDB */ - const unsigned char *addr; + u8 addr[ETH_ALEN]; u16 vid; } fdb; } u; -- cgit v1.2.3 From 890248261a18c7ae22923095dfadea2c0a2a304a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 6 Aug 2015 01:44:03 -0400 Subject: net: switchdev: support static FDB addresses This patch adds a is_static boolean to the switchdev_obj_fdb structure, in order to set the ndm_state to either NUD_NOARP or NUD_REACHABLE. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index e90e1a0fa579..0e296b82aef3 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -72,6 +72,7 @@ struct switchdev_obj { struct switchdev_obj_fdb { /* PORT_FDB */ u8 addr[ETH_ALEN]; u16 vid; + bool is_static; } fdb; } u; }; -- cgit v1.2.3 From 55045ddded0f39d84c2ca019508973be8c595a78 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 6 Aug 2015 01:44:04 -0400 Subject: net: dsa: add support for switchdev FDB objects Remove the fdb_{add,del,getnext} function pointer in favor of new port_fdb_{add,del,getnext}. Implement the switchdev_port_obj_{add,del,dump} functions in DSA to support the SWITCHDEV_OBJ_PORT_FDB objects. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fbca63ba8f73..091d35f77180 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,12 +296,16 @@ struct dsa_switch_driver { u32 br_port_mask); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); - int (*fdb_add)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_getnext)(struct dsa_switch *ds, int port, - unsigned char *addr, bool *is_static); + + /* + * Forwarding database + */ + int (*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid, + const u8 addr[ETH_ALEN]); + int (*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid, + const u8 addr[ETH_ALEN]); + int (*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid, + u8 addr[ETH_ALEN], bool *is_static); }; void register_switch_driver(struct dsa_switch_driver *type); -- cgit v1.2.3 From 51e0e5d8124ece158927a4c2288c0929d3b53aa3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:53 +0200 Subject: ieee802154: 6lowpan: remove multiple lowpan per wpan support We currently supports multiple lowpan interfaces per wpan interface. I never saw any use case into such functionality. We drop this feature now because it's much easier do deal with address changes inside the under laying wpan interface. This patch removes the multiple lowpan interface and adds a lowpan_dev netdev pointer into the wpan_dev, if this pointer isn't null the wpan interface belongs to the assigned lowpan interface. Reviewed-by: Stefan Schmidt Tested-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 382f94b59f2f..e53b6bfda976 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -173,6 +173,9 @@ struct wpan_dev { struct list_head list; struct net_device *netdev; + /* lowpan interface, set when the wpan_dev belongs to one lowpan_dev */ + struct net_device *lowpan_dev; + u32 identifier; /* MAC PIB */ -- cgit v1.2.3 From c91208d819c814e7f418c7a083059cf533ad0396 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:58 +0200 Subject: ieee802154: add ack request default handling This patch introduce a new mib entry which isn't part of 802.15.4 but useful as default behaviour to set the ack request bit or not if we don't know if the ack request bit should set. This is currently used for stacks like IEEE 802.15.4 6LoWPAN. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 5 +++++ include/net/nl802154.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e53b6bfda976..76b1ffaea863 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -63,6 +63,8 @@ struct cfg802154_ops { s8 max_frame_retries); int (*set_lbt_mode)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode); + int (*set_ackreq_default)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, bool ackreq); }; static inline bool @@ -196,6 +198,9 @@ struct wpan_dev { bool lbt; bool promiscuous_mode; + + /* fallback for acknowledgment bit setting */ + bool ackreq; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) diff --git a/include/net/nl802154.h b/include/net/nl802154.h index b0ab530d28cd..cf2713d8b975 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -52,6 +52,8 @@ enum nl802154_commands { NL802154_CMD_SET_LBT_MODE, + NL802154_CMD_SET_ACKREQ_DEFAULT, + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ @@ -104,6 +106,8 @@ enum nl802154_attrs { NL802154_ATTR_SUPPORTED_COMMANDS, + NL802154_ATTR_ACKREQ_DEFAULT, + /* add attributes here, update the policy in nl802154.c */ __NL802154_ATTR_AFTER_LAST, -- cgit v1.2.3 From 158e92185075184ebc5f25bab61fdd598693e28d Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:51 +0200 Subject: Bluetooth: preparation for new connect procedure Currently, when trying to connect to already paired device that just rotated its RPA MAC address, old address would be used and connection would fail. In order to fix that, kernel must scan and receive advertisement with fresh RPA before connecting. This patch adds some fields to hci_conn_params, in preparation to new connect procedure. explicit_connect will be used to override any current auto_connect action, and connect to device when ad is received. HCI_AUTO_CONN_EXPLICIT was added to auto_connect enum. When this value will be used, explicit connect is the only action, and params can be removed after successful connection. HCI_CONN_SCANNING is added to hci_conn flags. When it's set, connect is scan phase. It gets cleared when advertisement is received, and HCI_OP_LE_CREATE_CONN is sent. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2a6b0919e23f..c8d2b5a89d08 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -512,9 +512,11 @@ struct hci_conn_params { HCI_AUTO_CONN_DIRECT, HCI_AUTO_CONN_ALWAYS, HCI_AUTO_CONN_LINK_LOSS, + HCI_AUTO_CONN_EXPLICIT, } auto_connect; struct hci_conn *conn; + bool explicit_connect; }; extern struct list_head hci_dev_list; @@ -639,6 +641,7 @@ enum { HCI_CONN_DROP, HCI_CONN_PARAM_REMOVAL_PEND, HCI_CONN_NEW_LINK_KEY, + HCI_CONN_SCANNING, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) -- cgit v1.2.3 From e7d9ab731ac7babaf2e1b7b5e2280f5f555d263f Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:52 +0200 Subject: Bluetooth: add hci_lookup_le_connect This patch adds hci_lookup_le_connect method, that will be used to check wether outgoing le connection attempt is in progress. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c8d2b5a89d08..f0a9fc1d06e0 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -811,6 +811,26 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + !test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); -- cgit v1.2.3 From f75113a26008980ca13834fb6573145523596776 Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:53 +0200 Subject: Bluetooth: add hci_connect_le_scan Currently, when trying to connect to already paired device that just rotated its RPA MAC address, old address would be used and connection would fail. In order to fix that, kernel must scan and receive advertisement with fresh RPA before connecting. This patch adds hci_connect_le_scan with dependencies, new method that will be used to connect to remote LE devices. Instead of just sending connect request, it adds a device to whitelist. Later patches will make use of this whitelist to send conenct request when advertisement is received, and properly handle timeouts. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f0a9fc1d06e0..9e1a59e01fa2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -846,6 +846,9 @@ void hci_chan_del(struct hci_chan *chan); void hci_chan_list_flush(struct hci_conn *conn); struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); +struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, + u16 conn_timeout, u8 role); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, u8 role); @@ -1011,6 +1014,9 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type); +struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev, + bdaddr_t *addr, + u8 addr_type); void hci_uuids_clear(struct hci_dev *hdev); -- cgit v1.2.3 From fb811395cd5a71b9e94a068f524a6f4a21b67bdb Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Fri, 7 Aug 2015 11:10:37 -0700 Subject: net: add explicit logging and stat for neighbour table overflow Add an explicit neighbour table overflow message (ratelimited) and statistic to make diagnosing neighbour table overflows tractable in the wild. Diagnosing a neighbour table overflow can be quite difficult in the wild because there is no explicit dmesg logged. Callers to neighbour code seem to use net_dbg_ratelimit when the neighbour call fails which means the "base message" is not emitted and the callback suppressed messages from the ratelimiting can end-up juxtaposed with unrelated messages. Further, a forced garbage collection will increment a stat on each call whether it was successful in freeing-up a table entry or not, so that statistic is only a hint. So, add a net_info_ratelimited message and explicit statistic to the neighbour code. Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index bd33e66f49aa..8b683841e574 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -125,6 +125,7 @@ struct neigh_statistics { unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ + unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) -- cgit v1.2.3 From 2e15ea390e6f4466655066d97e22ec66870a042c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:42 -0700 Subject: ip_gre: Add support to collect tunnel metadata. Following patch create new tunnel flag which enable tunnel metadata collection on given device. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 47984415f5d1..984dbfa15e13 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -82,6 +82,8 @@ struct ip_tunnel_dst { __be32 saddr; }; +struct metadata_dst; + struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; @@ -115,6 +117,7 @@ struct ip_tunnel { unsigned int prl_count; /* # of entries in PRL */ int ip_tnl_net_id; struct gro_cells gro_cells; + bool collect_md; }; #define TUNNEL_CSUM __cpu_to_be16(0x01) @@ -149,6 +152,7 @@ struct tnl_ptk_info { struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; + struct ip_tunnel __rcu *collect_md_tun; }; struct ip_tunnel_encap_ops { @@ -235,7 +239,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 key); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, bool log_ecn_error); + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], -- cgit v1.2.3 From b2acd1dc3949cd60c571844d495594f05f0351f4 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:47 -0700 Subject: openvswitch: Use regular GRE net_device instead of vport Using GRE tunnel meta data collection feature, we can implement OVS GRE vport. This patch removes all of the OVS specific GRE code and make OVS use a ip_gre net_device. Minimal GRE vport is kept to handle compatibility with current userspace application. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/gre.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/gre.h b/include/net/gre.h index b53182018743..e3e08459bf67 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -33,16 +33,8 @@ struct gre_cisco_protocol { int gre_cisco_register(struct gre_cisco_protocol *proto); int gre_cisco_unregister(struct gre_cisco_protocol *proto); -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len); - -static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb, - bool csum) -{ - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); -} - +struct net_device *gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type); static inline int ip_gre_calc_hlen(__be16 o_flags) { -- cgit v1.2.3 From 9f57c67c379d88a10e8ad676426fee5ae7341b14 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:52 -0700 Subject: gre: Remove support for sharing GRE protocol hook. Support for sharing GREPROTO_CISCO port was added so that OVS gre port and kernel GRE devices can co-exist. After flow-based tunneling patches OVS GRE protocol processing is completely moved to ip_gre module. so there is no need for GRE protocol hook. Following patch consolidates GRE protocol related functions into ip_gre module. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/gre.h | 80 +++++-------------------------------------------------- 1 file changed, 6 insertions(+), 74 deletions(-) (limited to 'include/net') diff --git a/include/net/gre.h b/include/net/gre.h index e3e08459bf67..97eafdc47eea 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -4,6 +4,12 @@ #include #include +struct gre_base_hdr { + __be16 flags; + __be16 protocol; +}; +#define GRE_HEADER_SECTION 4 + #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 @@ -14,83 +20,9 @@ struct gre_protocol { void (*err_handler)(struct sk_buff *skb, u32 info); }; -struct gre_base_hdr { - __be16 flags; - __be16 protocol; -}; -#define GRE_HEADER_SECTION 4 - int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); -struct gre_cisco_protocol { - int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi); - int (*err_handler)(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi); - u8 priority; -}; - -int gre_cisco_register(struct gre_cisco_protocol *proto); -int gre_cisco_unregister(struct gre_cisco_protocol *proto); - struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); - -static inline int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags&TUNNEL_CSUM) - addend += 4; - if (o_flags&TUNNEL_KEY) - addend += 4; - if (o_flags&TUNNEL_SEQ) - addend += 4; - return addend; -} - -static inline __be16 gre_flags_to_tnl_flags(__be16 flags) -{ - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; -} - -static inline __be16 tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} - #endif -- cgit v1.2.3 From 308ac9143ee2208f54d061eca54a89da509b5d92 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 8 Aug 2015 21:40:01 +0200 Subject: netfilter: nf_conntrack: push zone object into functions This patch replaces the zone id which is pushed down into functions with the actual zone object. It's a bigger one-time change, but needed for later on extending zones with a direction parameter, and thus decoupling this additional information from all call-sites. No functional changes in this patch. The default zone becomes a global const object, namely nf_ct_zone_dflt and will be returned directly in various cases, one being, when there's f.e. no zoning support. Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 10 +++++++-- include/net/netfilter/nf_conntrack_core.h | 3 ++- include/net/netfilter/nf_conntrack_expect.h | 11 +++++++--- include/net/netfilter/nf_conntrack_zones.h | 33 ++++++++++++++++++++--------- 4 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 37cd3911d5c5..f5e23c6dee8b 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -250,8 +250,12 @@ void nf_ct_untracked_status_or(unsigned long bits); void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report); + +struct nf_conntrack_zone; + void nf_conntrack_free(struct nf_conn *ct); -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, +struct nf_conn *nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp); @@ -291,7 +295,9 @@ extern unsigned int nf_conntrack_max; extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); -struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, + const struct nf_conntrack_zone *zone, + gfp_t flags); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index f2f0fa3bb150..c03f9c42b3cd 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -52,7 +52,8 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(struct net *net, u16 zone, +nf_conntrack_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); int __nf_conntrack_confirm(struct sk_buff *skb); diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 3f3aecbc8632..dce56f09ac9a 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -4,7 +4,9 @@ #ifndef _NF_CONNTRACK_EXPECT_H #define _NF_CONNTRACK_EXPECT_H + #include +#include extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; @@ -76,15 +78,18 @@ int nf_conntrack_expect_init(void); void nf_conntrack_expect_fini(void); struct nf_conntrack_expect * -__nf_ct_expect_find(struct net *net, u16 zone, +__nf_ct_expect_find(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * -nf_ct_expect_find_get(struct net *net, u16 zone, +nf_ct_expect_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * -nf_ct_find_expectation(struct net *net, u16 zone, +nf_ct_find_expectation(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 034efe8d45a5..0788bb0f267d 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,25 +1,38 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H -#define NF_CT_DEFAULT_ZONE 0 - -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -#include +#define NF_CT_DEFAULT_ZONE_ID 0 struct nf_conntrack_zone { u16 id; }; -static inline u16 nf_ct_zone(const struct nf_conn *ct) +extern const struct nf_conntrack_zone nf_ct_zone_dflt; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include + +static inline const struct nf_conntrack_zone * +nf_ct_zone(const struct nf_conn *ct) { + const struct nf_conntrack_zone *nf_ct_zone = NULL; + #ifdef CONFIG_NF_CONNTRACK_ZONES - struct nf_conntrack_zone *nf_ct_zone; nf_ct_zone = nf_ct_ext_find(ct, NF_CT_EXT_ZONE); - if (nf_ct_zone) - return nf_ct_zone->id; #endif - return NF_CT_DEFAULT_ZONE; + return nf_ct_zone ? nf_ct_zone : &nf_ct_zone_dflt; } -#endif /* CONFIG_NF_CONNTRACK || CONFIG_NF_CONNTRACK_MODULE */ +static inline const struct nf_conntrack_zone * +nf_ct_zone_tmpl(const struct nf_conn *tmpl) +{ + return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; +} + +static inline bool nf_ct_zone_equal(const struct nf_conn *a, + const struct nf_conntrack_zone *b) +{ + return nf_ct_zone(a)->id == b->id; +} +#endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */ #endif /* _NF_CONNTRACK_ZONES_H */ -- cgit v1.2.3 From 42a7b32b73d6bf22e4bdd7bf68746e2d71f4cd8d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 10 Aug 2015 16:58:11 -0600 Subject: xfrm: Add oif to dst lookups Rules can be installed that direct route lookups to specific tables based on oif. Plumb the oif through the xfrm lookups so it gets set in the flow struct and passed to the resolver routines. Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f0ee97eec24d..312e3fee9ccf 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -285,10 +285,13 @@ struct xfrm_policy_afinfo { unsigned short family; struct dst_ops *dst_ops; void (*garbage_collect)(struct net *net); - struct dst_entry *(*dst_lookup)(struct net *net, int tos, + struct dst_entry *(*dst_lookup)(struct net *net, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr); - int (*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr); + int (*get_saddr)(struct net *net, int oif, + xfrm_address_t *saddr, + xfrm_address_t *daddr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); -- cgit v1.2.3 From cdf0969763e020923abe28fddc605add572febc2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 11 Aug 2015 12:00:37 -0700 Subject: Revert "Merge branch 'mv88e6xxx-switchdev-fdb'" This reverts commit f1d5ca434413b20cd3f8c18ff2b634b7782149a5, reversing changes made to 4933d85c5173832ebd261756522095837583c458. I applied v2 instead of v3. Signed-off-by: David S. Miller --- include/net/dsa.h | 16 ++++++---------- include/net/switchdev.h | 3 +-- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 091d35f77180..fbca63ba8f73 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,16 +296,12 @@ struct dsa_switch_driver { u32 br_port_mask); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); - - /* - * Forwarding database - */ - int (*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]); - int (*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]); - int (*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid, - u8 addr[ETH_ALEN], bool *is_static); + int (*fdb_add)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*fdb_del)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*fdb_getnext)(struct dsa_switch *ds, int port, + unsigned char *addr, bool *is_static); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 0e296b82aef3..89da8934519b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -70,9 +70,8 @@ struct switchdev_obj { u32 tb_id; } ipv4_fib; struct switchdev_obj_fdb { /* PORT_FDB */ - u8 addr[ETH_ALEN]; + const unsigned char *addr; u16 vid; - bool is_static; } fdb; } u; }; -- cgit v1.2.3 From 2a778e1b58990e15de5cba4badec1fa7ecb87e80 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 10 Aug 2015 09:09:49 -0400 Subject: net: dsa: change FDB routines prototypes Change the prototype of port_getnext to include a vid parameter. This is necessary to introduce the support for VLAN. Also rename the fdb_{add,del,getnext} function pointers to port_fdb_{add,del,getnext} since they are specific to a given port. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fbca63ba8f73..6356f437e911 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,12 +296,17 @@ struct dsa_switch_driver { u32 br_port_mask); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); - int (*fdb_add)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_getnext)(struct dsa_switch *ds, int port, - unsigned char *addr, bool *is_static); + + /* + * Forwarding database + */ + int (*port_fdb_add)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*port_fdb_del)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*port_fdb_getnext)(struct dsa_switch *ds, int port, + unsigned char *addr, u16 *vid, + bool *is_static); }; void register_switch_driver(struct dsa_switch_driver *type); -- cgit v1.2.3 From ce80e7bc57e25062c361de8fb6444129a63bac6d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 10 Aug 2015 09:09:52 -0400 Subject: net: switchdev: support static FDB addresses This patch adds an ndm_state member to the switchdev_obj_fdb structure, in order to support static FDB addresses. Set Rocker ndm_state to NUD_REACHABLE. Signed-off-by: Vivien Didelot Acked-by: Scott Feldman Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 89da8934519b..319baab3b48e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -72,6 +72,7 @@ struct switchdev_obj { struct switchdev_obj_fdb { /* PORT_FDB */ const unsigned char *addr; u16 vid; + u16 ndm_state; } fdb; } u; }; -- cgit v1.2.3 From b72f6f51dc5abce94c1b5ee0186e9407ea0f919f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 11 Aug 2015 21:44:08 +0200 Subject: 6lowpan: add generic 6lowpan netdev private data This patch introduced the 6lowpan netdev private data struct. We name it lowpan_priv and it's placed at the beginning of netdev private data. All lowpan interfaces should allocate this room at first of netdev private data. 6LoWPAN LL private data can be allocate by additional netdev private data, e.g. dev->priv_size should be "sizeof(struct lowpan_priv) + sizeof(LL_LOWPAN_PRIVATE_DATA)". Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/6lowpan.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/net') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index dc03d77ad23b..a2f59ec98d24 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -197,6 +197,27 @@ #define LOWPAN_NHC_UDP_CS_P_11 0xF3 /* source & dest = 0xF0B + 4bit inline */ #define LOWPAN_NHC_UDP_CS_C 0x04 /* checksum elided */ +#define LOWPAN_PRIV_SIZE(llpriv_size) \ + (sizeof(struct lowpan_priv) + llpriv_size) + +enum lowpan_lltypes { + LOWPAN_LLTYPE_BTLE, + LOWPAN_LLTYPE_IEEE802154, +}; + +struct lowpan_priv { + enum lowpan_lltypes lltype; + + /* must be last */ + u8 priv[0] __aligned(sizeof(void *)); +}; + +static inline +struct lowpan_priv *lowpan_priv(const struct net_device *dev) +{ + return netdev_priv(dev); +} + #ifdef DEBUG /* print data in line */ static inline void raw_dump_inline(const char *caller, char *msg, @@ -372,6 +393,8 @@ lowpan_uncompress_size(const struct sk_buff *skb, u16 *dgram_offset) return skb->len + uncomp_header - ret; } +void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype); + int lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, const u8 *saddr, const u8 saddr_type, -- cgit v1.2.3 From 4b58c37bb9d4282446f7a0194dbc44325787ac8c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 8 Jul 2015 15:41:48 +0300 Subject: mac80211: remove ieee80211_aes_cmac_calculate_k1_k2() The iwlwifi driver was the only driver that used this, but as it turns out it never needed it, so we can remove it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 484cc14fb947..e3314e516681 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4330,19 +4330,6 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u8 *p2k); -/** - * ieee80211_aes_cmac_calculate_k1_k2 - calculate the AES-CMAC sub keys - * - * This function computes the two AES-CMAC sub-keys, based on the - * previously installed master key. - * - * @keyconf: the parameter passed with the set key - * @k1: a buffer to be filled with the 1st sub-key - * @k2: a buffer to be filled with the 2nd sub-key - */ -void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf, - u8 *k1, u8 *k2); - /** * ieee80211_get_key_tx_seq - get key TX sequence counter * -- cgit v1.2.3 From 111495361598205967f1be4e07d4726b0f762d60 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 13 Aug 2015 12:52:17 -0400 Subject: net: dsa: add support for switchdev VLAN objects Add new functions in DSA drivers to access hardware VLAN entries through SWITCHDEV_OBJ_PORT_VLAN objects: - port_pvid_get() and vlan_getnext() to dump a VLAN - port_vlan_del() to exclude a port from a VLAN - port_pvid_set() and port_vlan_add() to join a port to a VLAN The DSA infrastructure will ensure that each VLAN of the given range does not already belong to another bridge. If it does, it will fallback to software VLAN and won't program the hardware. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6356f437e911..bd9b76502458 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -297,6 +297,17 @@ struct dsa_switch_driver { int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); + /* + * VLAN support + */ + int (*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid); + int (*port_pvid_set)(struct dsa_switch *ds, int port, u16 pvid); + int (*port_vlan_add)(struct dsa_switch *ds, int port, u16 vid, + bool untagged); + int (*port_vlan_del)(struct dsa_switch *ds, int port, u16 vid); + int (*vlan_getnext)(struct dsa_switch *ds, u16 *vid, + unsigned long *ports, unsigned long *untagged); + /* * Forwarding database */ -- cgit v1.2.3 From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:00 -0600 Subject: net: Introduce VRF related flags and helpers Add a VRF_MASTER flag for interfaces and helper functions for determining if a device is a VRF_MASTER. Add link attribute for passing VRF_TABLE id. Add vrf_ptr to netdevice. Add various macros for determining if a device is a VRF device, the index of the master VRF device and table associated with VRF device. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/vrf.h | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 include/net/vrf.h (limited to 'include/net') diff --git a/include/net/vrf.h b/include/net/vrf.h new file mode 100644 index 000000000000..0484d29d4589 --- /dev/null +++ b/include/net/vrf.h @@ -0,0 +1,139 @@ +/* + * include/net/net_vrf.h - adds vrf dev structure definitions + * Copyright (c) 2015 Cumulus Networks + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_NET_VRF_H +#define __LINUX_NET_VRF_H + +struct net_vrf_dev { + struct rcu_head rcu; + int ifindex; /* ifindex of master dev */ + u32 tb_id; /* table id for VRF */ +}; + +struct slave { + struct list_head list; + struct net_device *dev; +}; + +struct slave_queue { + struct list_head all_slaves; + int num_slaves; +}; + +struct net_vrf { + struct slave_queue queue; + struct rtable *rth; + u32 tb_id; +}; + + +#if IS_ENABLED(CONFIG_NET_VRF) +/* called with rcu_read_lock() */ +static inline int vrf_master_ifindex_rcu(const struct net_device *dev) +{ + struct net_vrf_dev *vrf_ptr; + int ifindex = 0; + + if (!dev) + return 0; + + if (netif_is_vrf(dev)) + ifindex = dev->ifindex; + else { + vrf_ptr = rcu_dereference(dev->vrf_ptr); + if (vrf_ptr) + ifindex = vrf_ptr->ifindex; + } + + return ifindex; +} + +/* called with rcu_read_lock */ +static inline int vrf_dev_table_rcu(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr; + + vrf_ptr = rcu_dereference(dev->vrf_ptr); + if (vrf_ptr) + tb_id = vrf_ptr->tb_id; + } + return tb_id; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + int tb_id; + + rcu_read_lock(); + tb_id = vrf_dev_table_rcu(dev); + rcu_read_unlock(); + + return tb_id; +} + +/* called with rtnl */ +static inline int vrf_dev_table_rtnl(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr; + + vrf_ptr = rtnl_dereference(dev->vrf_ptr); + if (vrf_ptr) + tb_id = vrf_ptr->tb_id; + } + return tb_id; +} + +/* caller has already checked netif_is_vrf(dev) */ +static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev) +{ + struct rtable *rth = ERR_PTR(-ENETUNREACH); + struct net_vrf *vrf = netdev_priv(dev); + + if (vrf) { + rth = vrf->rth; + atomic_inc(&rth->dst.__refcnt); + } + return rth; +} + +#else +static inline int vrf_master_ifindex_rcu(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table_rcu(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table_rtnl(const struct net_device *dev) +{ + return 0; +} + +static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev) +{ + return ERR_PTR(-ENETUNREACH); +} +#endif + +#endif /* __LINUX_NET_VRF_H */ -- cgit v1.2.3 From 613d09b30f8b589d5a9b49775054c8865db95d1c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:02 -0600 Subject: net: Use VRF device index for lookups on TX As with ingress use the index of VRF master device for route lookups on egress. However, the oif should only be used to direct the lookups to a specific table. Routes in the table are not based on the VRF device but rather interfaces that are part of the VRF so do not consider the oif for lookups within the table. The FLOWI_FLAG_VRFSRC is used to control this latter part. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/flow.h | 1 + include/net/route.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 3098ae33a178..f305588fc162 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -33,6 +33,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 +#define FLOWI_FLAG_VRFSRC 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/route.h b/include/net/route.h index 2d45f419477f..94189d4bd899 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -251,6 +251,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; + if (netif_index_is_vrf(sock_net(sk), oif)) + flow_flags |= FLOWI_FLAG_VRFSRC; + flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); } -- cgit v1.2.3 From 15be405eb2ea943ac5fa2aab7d0ba282e9ef1301 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:04 -0600 Subject: net: Add inet_addr lookup by table Currently inet_addr_type and inet_dev_addr_type expect local addresses to be in the local table. With the VRF device local routes for devices associated with a VRF will be in the table associated with the VRF. Provide an alternate inet_addr lookup to use a specific table rather than defaulting to the local table. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 94189d4bd899..6ba681f0b98d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -189,6 +189,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); -- cgit v1.2.3 From 30bbaa19500559d7625c65632195413f639b3b97 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:05 -0600 Subject: net: Fix up inet_addr_type checks Currently inet_addr_type and inet_dev_addr_type expect local addresses to be in the local table. With the VRF device local routes for devices associated with a VRF will be in the table associated with the VRF. Provide an alternate inet_addr lookup to use a specific table rather than defaulting to the local table. inet_addr_type_dev_table keeps the same semantics as inet_addr_type but if the passed in device is enslaved to a VRF then the table for that VRF is used for the lookup. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 6ba681f0b98d..6dda2c1bf8c6 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -192,6 +192,9 @@ unsigned int inet_addr_type(struct net *net, __be32 addr); unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); +unsigned int inet_addr_type_dev_table(struct net *net, + const struct net_device *dev, + __be32 addr); void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); -- cgit v1.2.3 From dc028da54ed353edd44dca88b7eb19fd5126c354 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 16 Aug 2015 17:13:27 -0600 Subject: inet: Move VRF table lookup to inlined function Table lookup compiles out when VRF is not enabled. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/vrf.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/vrf.h b/include/net/vrf.h index 0484d29d4589..40e3793c7a05 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -81,6 +81,25 @@ static inline int vrf_dev_table(const struct net_device *dev) return tb_id; } +static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +{ + struct net_device *dev; + int tb_id = 0; + + if (!ifindex) + return 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + tb_id = vrf_dev_table_rcu(dev); + + rcu_read_unlock(); + + return tb_id; +} + /* called with rtnl */ static inline int vrf_dev_table_rtnl(const struct net_device *dev) { @@ -125,6 +144,11 @@ static inline int vrf_dev_table(const struct net_device *dev) return 0; } +static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +{ + return 0; +} + static inline int vrf_dev_table_rtnl(const struct net_device *dev) { return 0; -- cgit v1.2.3 From deedb59039f111c41aa5a54ee384c8e7c08bc78a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Aug 2015 16:03:39 +0200 Subject: netfilter: nf_conntrack: add direction support for zones This work adds a direction parameter to netfilter zones, so identity separation can be performed only in original/reply or both directions (default). This basically opens up the possibility of doing NAT with conflicting IP address/port tuples from multiple, isolated tenants on a host (e.g. from a netns) without requiring each tenant to NAT twice resp. to use its own dedicated IP address to SNAT to, meaning overlapping tuples can be made unique with the zone identifier in original direction, where the NAT engine will then allocate a unique tuple in the commonly shared default zone for the reply direction. In some restricted, local DNAT cases, also port redirection could be used for making the reply traffic unique w/o requiring SNAT. The consensus we've reached and discussed at NFWS and since the initial implementation [1] was to directly integrate the direction meta data into the existing zones infrastructure, as opposed to the ct->mark approach we proposed initially. As we pass the nf_conntrack_zone object directly around, we don't have to touch all call-sites, but only those, that contain equality checks of zones. Thus, based on the current direction (original or reply), we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID. CT expectations are direction-agnostic entities when expectations are being compared among themselves, so we can only use the identifier in this case. Note that zone identifiers can not be included into the hash mix anymore as they don't contain a "stable" value that would be equal for both directions at all times, f.e. if only zone->id would unconditionally be xor'ed into the table slot hash, then replies won't find the corresponding conntracking entry anymore. If no particular direction is specified when configuring zones, the behaviour is exactly as we expect currently (both directions). Support has been added for the CT netlink interface as well as the x_tables raw CT target, which both already offer existing interfaces to user space for the configuration of zones. Below a minimal, simplified collision example (script in [2]) with netperf sessions: +--- tenant-1 ---+ mark := 1 | netperf |--+ +----------------+ | CT zone := mark [ORIGINAL] [ip,sport] := X +--------------+ +--- gateway ---+ | mark routing |--| SNAT |-- ... + +--------------+ +---------------+ | +--- tenant-2 ---+ | ~~~|~~~ | netperf |--+ +-----------+ | +----------------+ mark := 2 | netserver |------ ... + [ip,sport] := X +-----------+ [ip,port] := Y On the gateway netns, example: iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL iptables -t nat -A POSTROUTING -o -j SNAT --to-source --random-fully iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark conntrack dump from gateway netns: netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1 src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024 [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1 tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2 src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555 [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1 tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1 src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438 [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1 tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2 src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889 [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2 Taking this further, test script in [2] creates 200 tenants and runs original-tuple colliding netperf sessions each. A conntrack -L dump in the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED state as expected. I also did run various other tests with some permutations of the script, to mention some: SNAT in random/random-fully/persistent mode, no zones (no overlaps), static zones (original, reply, both directions), etc. [1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/ [2] https://paste.fedoraproject.org/242835/65657871/ Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 31 +++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 0788bb0f267d..3942ddf0d4ff 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,10 +1,18 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H +#include + #define NF_CT_DEFAULT_ZONE_ID 0 +#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) +#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) + +#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) + struct nf_conntrack_zone { u16 id; + u16 dir; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; @@ -29,8 +37,29 @@ nf_ct_zone_tmpl(const struct nf_conn *tmpl) return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; } +static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, + enum ip_conntrack_dir dir) +{ + return zone->dir & (1 << dir); +} + +static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, + enum ip_conntrack_dir dir) +{ + return nf_ct_zone_matches_dir(zone, dir) ? + zone->id : NF_CT_DEFAULT_ZONE_ID; +} + static inline bool nf_ct_zone_equal(const struct nf_conn *a, - const struct nf_conntrack_zone *b) + const struct nf_conntrack_zone *b, + enum ip_conntrack_dir dir) +{ + return nf_ct_zone_id(nf_ct_zone(a), dir) == + nf_ct_zone_id(b, dir); +} + +static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, + const struct nf_conntrack_zone *b) { return nf_ct_zone(a)->id == b->id; } -- cgit v1.2.3 From 5e8018fc61423e677398d4ad4d72df70b9788e77 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Aug 2015 16:03:40 +0200 Subject: netfilter: nf_conntrack: add efficient mark to zone mapping This work adds the possibility of deriving the zone id from the skb->mark field in a scalable manner. This allows for having only a single template serving hundreds/thousands of different zones, for example, instead of the need to have one match for each zone as an extra CT jump target. Note that we'd need to have this information attached to the template as at the time when we're trying to lookup a possible ct object, we already need to know zone information for a possible match when going into __nf_conntrack_find_get(). This work provides a minimal implementation for a possible mapping. In order to not add/expose an extra ct->status bit, the zone structure has been extended to carry a flag for deriving the mark. Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 45 ++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 3942ddf0d4ff..5316c7b3a374 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -10,9 +10,12 @@ #define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) +#define NF_CT_FLAG_MARK 1 + struct nf_conntrack_zone { u16 id; - u16 dir; + u8 flags; + u8 dir; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; @@ -32,9 +35,45 @@ nf_ct_zone(const struct nf_conn *ct) } static inline const struct nf_conntrack_zone * -nf_ct_zone_tmpl(const struct nf_conn *tmpl) +nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) +{ + zone->id = id; + zone->flags = flags; + zone->dir = dir; + + return zone; +} + +static inline const struct nf_conntrack_zone * +nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, + struct nf_conntrack_zone *tmp) +{ + const struct nf_conntrack_zone *zone; + + if (!tmpl) + return &nf_ct_zone_dflt; + + zone = nf_ct_zone(tmpl); + if (zone->flags & NF_CT_FLAG_MARK) + zone = nf_ct_zone_init(tmp, skb->mark, zone->dir, 0); + + return zone; +} + +static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags, + const struct nf_conntrack_zone *info) { - return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; +#ifdef CONFIG_NF_CONNTRACK_ZONES + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags); + if (!nf_ct_zone) + return -ENOMEM; + + nf_ct_zone_init(nf_ct_zone, info->id, info->dir, + info->flags); +#endif + return 0; } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, -- cgit v1.2.3 From 2536862311d2276454ddef9dc36d6551a4b400fd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:24 -0700 Subject: lwt: Add support to redirect dst.input This patch adds the capability to redirect dst input in the same way that dst output is redirected by LWT. Also, save the original dst.input and and dst.out when setting up lwtunnel redirection. These can be called by the client as a pass- through. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 33bd30963a95..e25b60eb262d 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -11,12 +11,15 @@ #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ -#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 +#define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) +#define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) struct lwtunnel_state { __u16 type; __u16 flags; atomic_t refcnt; + int (*orig_output)(struct sock *sk, struct sk_buff *skb); + int (*orig_input)(struct sk_buff *); int len; __u8 data[0]; }; @@ -25,6 +28,7 @@ struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, struct lwtunnel_state **ts); int (*output)(struct sock *sk, struct sk_buff *skb); + int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); @@ -58,6 +62,13 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) + return true; + + return false; +} int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, @@ -72,6 +83,8 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); +int lwtunnel_input(struct sk_buff *skb); +int lwtunnel_input6(struct sk_buff *skb); #else @@ -90,6 +103,11 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { @@ -142,6 +160,16 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } +static inline int lwtunnel_input(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_input6(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __NET_LWTUNNEL_H */ -- cgit v1.2.3 From 4b048d6d9d0b0b90e1e94f2393796bbf1fa8df4e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:25 -0700 Subject: net: Change pseudohdr argument of inet_proto_csum_replace* to be a bool inet_proto_csum_replace4,2,16 take a pseudohdr argument which indicates the checksum field carries a pseudo header. This argument should be a boolean instead of an int. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/checksum.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 2d1d73cb773e..619f3445d57e 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -140,14 +140,14 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr); + __be32 from, __be32 to, bool pseudohdr); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, - int pseudohdr); + bool pseudohdr); static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, - int pseudohdr) + bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); -- cgit v1.2.3 From abc5d1ff3e8f9b4a9d274818459b123e31981dc9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:26 -0700 Subject: net: Add inet_proto_csum_replace_by_diff utility function This function updates a checksum field value and skb->csum based on a value which is the difference between the old and new checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/checksum.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 619f3445d57e..9fcaedf994ee 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -144,6 +144,8 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr); static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, -- cgit v1.2.3 From 60045cbfc0b291dae8dd5b929d67b87c5ea954d4 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 17 Aug 2015 23:52:51 +0200 Subject: net: dsa: Add dsa_is_dsa_port() helper Add an inline helper for determining is a port is a DSA port. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index bd9b76502458..b34d812bc5d0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -171,6 +171,11 @@ static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port); } +static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) +{ + return !!((ds->dsa_port_mask) & (1 << p)); +} + static inline bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { return ds->phys_port_mask & (1 << p) && ds->ports[p]; -- cgit v1.2.3 From df383e6240ef222703648072dafd2a1ae21b0d2a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 18 Aug 2015 18:41:13 +0200 Subject: lwtunnel: fix memory leak The built lwtunnel_state struct has to be freed after comparison. Fixes: 571e722676fe3 ("ipv4: support for fib route lwtunnel encap attributes") Signed-off-by: Jiri Benc Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index e25b60eb262d..34fd8f70c2ca 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -36,6 +36,11 @@ struct lwtunnel_encap_ops { }; #ifdef CONFIG_LWTUNNEL +static inline void lwtstate_free(struct lwtunnel_state *lws) +{ + kfree(lws); +} + static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { @@ -51,7 +56,7 @@ static inline void lwtstate_put(struct lwtunnel_state *lws) return; if (atomic_dec_and_test(&lws->refcnt)) - kfree(lws); + lwtstate_free(lws); } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) -- cgit v1.2.3 From db5dbec5ef2d4565bb8d42709802de66b06f9965 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 18 Aug 2015 20:28:02 +0300 Subject: vrf: drop unused num_slaves member slave_queue has a num_slaves member which is unused, drop it. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/vrf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/vrf.h b/include/net/vrf.h index 40e3793c7a05..3bb4af462ed6 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -24,7 +24,6 @@ struct slave { struct slave_queue { struct list_head all_slaves; - int num_slaves; }; struct net_vrf { -- cgit v1.2.3 From bf798657eb5ba57552096843c315f096fdf9b715 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 12 Aug 2015 17:41:00 +0200 Subject: netfilter: nf_tables: Use 32 bit addressing register from nft_type_to_reg() nft_type_to_reg() needs to return the register in the new 32 bit addressing, otherwise we hit EINVAL when using mappings. Fixes: 49499c3 ("netfilter: nf_tables: switch registers to 32 bit addressing") Reported-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2a246680a6c3..aa8bee72c9d3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -125,7 +125,7 @@ static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { - return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1; + return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } unsigned int nft_parse_register(const struct nlattr *attr); -- cgit v1.2.3 From 18e1db67e93ed75d9dc0d34c8d783ccf10547c2b Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Thu, 13 Aug 2015 08:58:15 +0200 Subject: netfilter: bridge: fix IPv6 packets not being bridged with CONFIG_IPV6=n 230ac490f7fba introduced a dependency to CONFIG_IPV6 which breaks bridging of IPv6 packets on a bridge with CONFIG_IPV6=n. Sysctl entry /proc/sys/net/bridge/bridge-nf-call-ip6tables defaults to 1, for this reason packets are handled by br_nf_pre_routing_ipv6(). When compiled with CONFIG_IPV6=n this function returns NF_DROP but should return NF_ACCEPT to let packets through. Change CONFIG_IPV6=n br_nf_pre_routing_ipv6() return value to NF_ACCEPT. Tested with a simple bridge with two interfaces and IPv6 packets trying to pass from host on left side to host on right side of the bridge. Fixes: 230ac490f7fba ("netfilter: bridge: split ipv6 code into separated file") Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/br_netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index bab824bde92c..d4c6b5f30acd 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -59,7 +59,7 @@ static inline unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { - return NF_DROP; + return NF_ACCEPT; } #endif -- cgit v1.2.3 From 824e7383e92815cb591793c74cc836aa5165f7f8 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 19 Aug 2015 15:46:17 +0800 Subject: lwtunnel: Fix the sparse warnings in fib_encap_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_LWTUNNEL config is not enabled, the lwtstate_free() is not declared in lwtunnel.h at all. However, even in this case, the function is still referenced in fib_semantics.c so that there appears the following sparse warnings: net/ipv4/fib_semantics.c:553:17: error: undefined identifier 'lwtstate_free' CC net/ipv4/fib_semantics.o net/ipv4/fib_semantics.c: In function ‘fib_encap_match’: net/ipv4/fib_semantics.c:553:3: error: implicit declaration of function ‘lwtstate_free’ [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors make[1]: *** [net/ipv4/fib_semantics.o] Error 1 make: *** [net/ipv4/fib_semantics.o] Error 2 To eliminate the error, we define an empty function for lwtstate_free() in lwtunnel.h when CONFIG_LWTUNNEL is disabled. Fixes: df383e6240ef ("lwtunnel: fix memory leak") Cc: Jiri Benc Reported-by: kbuild test robot Signed-off-by: Ying Xue Acked-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 34fd8f70c2ca..cfee53916ba5 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -93,6 +93,10 @@ int lwtunnel_input6(struct sk_buff *skb); #else +static inline void lwtstate_free(struct lwtunnel_state *lws) +{ +} + static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { -- cgit v1.2.3 From 18041e31743d278b6323518d20a2ef656c3cc689 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 18 Aug 2015 21:40:16 +0300 Subject: vrf: vrf_master_ifindex_rcu is not always called with rcu read lock While running net-next I hit this: [ 634.073119] =============================== [ 634.073150] [ INFO: suspicious RCU usage. ] [ 634.073182] 4.2.0-rc6+ #45 Not tainted [ 634.073213] ------------------------------- [ 634.073244] include/net/vrf.h:38 suspicious rcu_dereference_check() usage! [ 634.073274] other info that might help us debug this: [ 634.073307] rcu_scheduler_active = 1, debug_locks = 1 [ 634.073338] 2 locks held by swapper/0/0: [ 634.073369] #0: (((&n->timer))){+.-...}, at: [] call_timer_fn+0x5/0x480 [ 634.073412] #1: (slock-AF_INET){+.-...}, at: [] icmp_send+0x155/0x5f0 [ 634.073450] stack backtrace: [ 634.073483] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.2.0-rc6+ #45 [ 634.073514] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 634.073545] 0000000000000000 0593ba8242d9ace4 ffff88002fc03b48 ffffffff81803f1b [ 634.073612] 0000000000000000 ffffffff81e12500 ffff88002fc03b78 ffffffff811003c5 [ 634.073642] 0000000000000000 ffff88002ec4e600 ffffffff81f00f80 ffff88002fc03cf0 [ 634.073669] Call Trace: [ 634.073694] [] dump_stack+0x4c/0x65 [ 634.073728] [] lockdep_rcu_suspicious+0xc5/0x100 [ 634.073763] [] icmp_route_lookup+0x176/0x5c0 [ 634.073793] [] ? icmp_send+0x35b/0x5f0 [ 634.073818] [] ? icmp_send+0x2d4/0x5f0 [ 634.073844] [] icmp_send+0x42e/0x5f0 [ 634.073873] [] ipv4_link_failure+0x22/0xa0 [ 634.073899] [] arp_error_report+0x3a/0x80 [ 634.073926] [] ? neigh_lookup+0x2c0/0x2c0 [ 634.073952] [] neigh_invalidate+0x8e/0x110 [ 634.073984] [] neigh_timer_handler+0x1ae/0x290 [ 634.074013] [] ? neigh_lookup+0x2c0/0x2c0 [ 634.074013] [] call_timer_fn+0xb3/0x480 [ 634.074013] [] ? call_timer_fn+0x5/0x480 [ 634.074013] [] ? neigh_lookup+0x2c0/0x2c0 [ 634.074013] [] run_timer_softirq+0x20c/0x430 [ 634.074013] [] __do_softirq+0xde/0x630 [ 634.074013] [] irq_exit+0x117/0x120 [ 634.074013] [] smp_apic_timer_interrupt+0x46/0x60 [ 634.074013] [] apic_timer_interrupt+0x70/0x80 [ 634.074013] [] ? native_safe_halt+0x6/0x10 [ 634.074013] [] ? trace_hardirqs_on+0xd/0x10 [ 634.074013] [] default_idle+0x23/0x200 [ 634.074013] [] arch_cpu_idle+0xf/0x20 [ 634.074013] [] default_idle_call+0x2a/0x40 [ 634.074013] [] cpu_startup_entry+0x39c/0x4c0 [ 634.074013] [] rest_init+0x13d/0x150 [ 634.074013] [] start_kernel+0x4a8/0x4c9 [ 634.074013] [] ? early_idt_handler_array+0x120/0x120 [ 634.074013] [] x86_64_start_reservations+0x2a/0x2c [ 634.074013] [] x86_64_start_kernel+0x14a/0x16d It would seem vrf_master_ifindex_rcu() can be called without RCU held in other contexts as well so introduce a new helper which acquires rcu and returns the ifindex. Also add curly braces around both the "if" and "else" parts as per the style guide. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/vrf.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/vrf.h b/include/net/vrf.h index 3bb4af462ed6..5bfb16237fd7 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -43,9 +43,9 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev) if (!dev) return 0; - if (netif_is_vrf(dev)) + if (netif_is_vrf(dev)) { ifindex = dev->ifindex; - else { + } else { vrf_ptr = rcu_dereference(dev->vrf_ptr); if (vrf_ptr) ifindex = vrf_ptr->ifindex; @@ -54,6 +54,17 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev) return ifindex; } +static inline int vrf_master_ifindex(const struct net_device *dev) +{ + int ifindex; + + rcu_read_lock(); + ifindex = vrf_master_ifindex_rcu(dev); + rcu_read_unlock(); + + return ifindex; +} + /* called with rcu_read_lock */ static inline int vrf_dev_table_rcu(const struct net_device *dev) { @@ -133,6 +144,11 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev) return 0; } +static inline int vrf_master_ifindex(const struct net_device *dev) +{ + return 0; +} + static inline int vrf_dev_table_rcu(const struct net_device *dev) { return 0; -- cgit v1.2.3 From fdf79bd48876812acf0de58ed7a8bc1b3a3c67d6 Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Thu, 20 Aug 2015 17:26:00 +0200 Subject: NFC: nci: Add post_setup handler Some drivers require non-standard configuration after NCI_CORE_INIT request, because they need to know ndev->manufact_specific_info or ndev->manufact_id. This patch adds post_setup handler allowing to do such custom configuration. Signed-off-by: Robert Baldyga Signed-off-by: Samuel Ortiz --- include/net/nfc/nci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 01fc8c531115..1bdaa5f51107 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -79,6 +79,7 @@ struct nci_ops { int (*close)(struct nci_dev *ndev); int (*send)(struct nci_dev *ndev, struct sk_buff *skb); int (*setup)(struct nci_dev *ndev); + int (*post_setup)(struct nci_dev *ndev); int (*fw_download)(struct nci_dev *ndev, const char *firmware_name); __u32 (*get_rfprotocol)(struct nci_dev *ndev, __u8 rf_protocol); int (*discover_se)(struct nci_dev *ndev); -- cgit v1.2.3 From 025a0cb8380b7100d39fb426db9192b6c59595dc Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Thu, 20 Aug 2015 17:26:01 +0200 Subject: NFC: nci: export nci_core_reset and nci_core_init Some drivers needs to have ability to reinit NCI core, for example after updating firmware in setup() of post_setup() callback. This patch makes nci_core_reset() and nci_core_init() functions public, to make it possible. Signed-off-by: Robert Baldyga Signed-off-by: Samuel Ortiz --- include/net/nfc/nci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 1bdaa5f51107..d0d0f1e53bb9 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -278,6 +278,8 @@ int nci_request(struct nci_dev *ndev, unsigned long opt), unsigned long opt, __u32 timeout); int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload); +int nci_core_reset(struct nci_dev *ndev); +int nci_core_init(struct nci_dev *ndev); int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb); int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val); -- cgit v1.2.3 From 29e76924cf087bc6a9114a9244828fd13ae959bb Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Wed, 19 Aug 2015 21:26:43 +0200 Subject: nfc: netlink: Add capability to reply to vendor_cmd with data A proprietary vendor command may send back useful data to the user application. For example, the field level applied on the NFC router antenna. Still based on net/wireless/nl80211.c implementation, add nfc_vendor_cmd_alloc_reply_skb and nfc_vendor_cmd_reply in order to send back over netlink data generated by a proprietary command. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/net') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index f9e58ae45f9c..30afc9a6718c 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -203,6 +203,7 @@ struct nfc_dev { int n_vendor_cmds; struct nfc_ops *ops; + struct genl_info *cur_cmd_info; }; #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev) @@ -318,4 +319,44 @@ static inline int nfc_set_vendor_cmds(struct nfc_dev *dev, return 0; } +struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, + enum nfc_attrs attr, + u32 oui, u32 subcmd, + int approxlen); +int nfc_vendor_cmd_reply(struct sk_buff *skb); + +/** + * nfc_vendor_cmd_alloc_reply_skb - allocate vendor command reply + * @dev: nfc device + * @oui: vendor oui + * @approxlen: an upper bound of the length of the data that will + * be put into the skb + * + * This function allocates and pre-fills an skb for a reply to + * a vendor command. Since it is intended for a reply, calling + * it outside of a vendor command's doit() operation is invalid. + * + * The returned skb is pre-filled with some identifying data in + * a way that any data that is put into the skb (with skb_put(), + * nla_put() or similar) will end up being within the + * %NFC_ATTR_VENDOR_DATA attribute, so all that needs to be done + * with the skb is adding data for the corresponding userspace tool + * which can then read that data out of the vendor data attribute. + * You must not modify the skb in any other way. + * + * When done, call nfc_vendor_cmd_reply() with the skb and return + * its error code as the result of the doit() operation. + * + * Return: An allocated and pre-filled skb. %NULL if any errors happen. + */ +static inline struct sk_buff * +nfc_vendor_cmd_alloc_reply_skb(struct nfc_dev *dev, + u32 oui, u32 subcmd, int approxlen) +{ + return __nfc_alloc_vendor_cmd_reply_skb(dev, + NFC_ATTR_VENDOR_DATA, + oui, + subcmd, approxlen); +} + #endif /* __NET_NFC_H */ -- cgit v1.2.3 From ac1cf3990c99802eae3aa735b35c94a2131eb9fe Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:20 +0200 Subject: ip_tunnels: remove custom alignment and packing The custom alignment of struct ip_tunnel_key is unnecessary. In struct sw_flow_key, it starts at offset 256, in struct ip_tunnel_info it's the first field. The structure is also packed even without the __packed keyword. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 984dbfa15e13..81cf11c931e4 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -36,7 +36,7 @@ struct ip_tunnel_key { __u8 ipv4_ttl; __be16 tp_src; __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ +}; /* Indicates whether the tunnel info structure represents receive * or transmit tunnel parameters. -- cgit v1.2.3 From 6b8847c5a2bafbbf92f4b779f87165093457ea68 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:21 +0200 Subject: ip_tunnels: use u8/u16/u32 The ip_tunnels.h include file uses mixture of __u16 and u16 (etc.) types. Unify it to the non-underscore variants. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 81cf11c931e4..ca173f22f07f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -32,8 +32,8 @@ struct ip_tunnel_key { __be32 ipv4_src; __be32 ipv4_dst; __be16 tun_flags; - __u8 ipv4_tos; - __u8 ipv4_ttl; + u8 ipv4_tos; + u8 ipv4_ttl; __be16 tp_src; __be16 tp_dst; }; @@ -64,8 +64,8 @@ struct ip_tunnel_6rd_parm { #endif struct ip_tunnel_encap { - __u16 type; - __u16 flags; + u16 type; + u16 flags; __be16 sport; __be16 dport; }; @@ -95,8 +95,8 @@ struct ip_tunnel { * arrived */ /* These four fields used only by GRE */ - __u32 i_seqno; /* The last seen seqno */ - __u32 o_seqno; /* The last output seqno */ + u32 i_seqno; /* The last seen seqno */ + u32 o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ int mlink; @@ -273,8 +273,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df, bool xnet); + __be32 src, __be32 dst, u8 proto, + u8 tos, u8 ttl, __be16 df, bool xnet); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); -- cgit v1.2.3 From 376534a3d17002d608985bd67c3b0880eacadd14 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:22 +0200 Subject: ip_tunnels: use offsetofend Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index ca173f22f07f..cc3b39e9010b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -23,9 +23,7 @@ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ -#define IP_TUNNEL_KEY_SIZE \ - (offsetof(struct ip_tunnel_key, tp_dst) + \ - FIELD_SIZEOF(struct ip_tunnel_key, tp_dst)) +#define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) struct ip_tunnel_key { __be64 tun_id; -- cgit v1.2.3 From c1ea5d672aaff08da337dee735dbb548e3415585 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:23 +0200 Subject: ip_tunnels: add IPv6 addresses to ip_tunnel_key Add the IPv6 addresses as an union with IPv4 ones. When using IPv4, the newly introduced padding after the IPv4 addresses needs to be zeroed out. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cc3b39e9010b..6a51371dad00 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -25,10 +25,24 @@ /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) +/* Used to memset ipv4 address padding. */ +#define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) +#define IP_TUNNEL_KEY_IPV4_PAD_LEN \ + (FIELD_SIZEOF(struct ip_tunnel_key, u) - \ + FIELD_SIZEOF(struct ip_tunnel_key, u.ipv4)) + struct ip_tunnel_key { __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; + union { + struct { + __be32 src; + __be32 dst; + } ipv4; + struct { + struct in6_addr src; + struct in6_addr dst; + } ipv6; + } u; __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; @@ -177,8 +191,10 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, const void *opts, u8 opts_len) { tun_info->key.tun_id = tun_id; - tun_info->key.ipv4_src = saddr; - tun_info->key.ipv4_dst = daddr; + tun_info->key.u.ipv4.src = saddr; + tun_info->key.u.ipv4.dst = daddr; + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, + 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); tun_info->key.ipv4_tos = tos; tun_info->key.ipv4_ttl = ttl; tun_info->key.tun_flags = tun_flags; -- cgit v1.2.3 From 7c383fb2254c44e096427470da6a36380169b548 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:24 +0200 Subject: ip_tunnels: use tos and ttl fields also for IPv6 Rename the ipv4_tos and ipv4_ttl fields to just 'tos' and 'ttl', as they'll be used with IPv6 tunnels, too. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6a51371dad00..224e4ecec91b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -44,8 +44,8 @@ struct ip_tunnel_key { } ipv6; } u; __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; + u8 tos; /* TOS for IPv4, TC for IPv6 */ + u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; }; @@ -195,8 +195,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_info->key.u.ipv4.dst = daddr; memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); - tun_info->key.ipv4_tos = tos; - tun_info->key.ipv4_ttl = ttl; + tun_info->key.tos = tos; + tun_info->key.ttl = ttl; tun_info->key.tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of -- cgit v1.2.3 From 61adedf3e3f1d3f032c5a6a299978d91eff6d555 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:25 +0200 Subject: route: move lwtunnel state to dst_entry Currently, the lwtunnel state resides in per-protocol data. This is a problem if we encapsulate ipv6 traffic in an ipv4 tunnel (or vice versa). The xmit function of the tunnel does not know whether the packet has been routed to it by ipv4 or ipv6, yet it needs the lwtstate data. Moving the lwtstate data to dst_entry makes such inter-protocol tunneling possible. As a bonus, this brings a nice diffstat. Signed-off-by: Jiri Benc Acked-by: Roopa Prabhu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst.h | 3 ++- include/net/dst_metadata.h | 15 +++++---------- include/net/ip6_fib.h | 1 - include/net/lwtunnel.h | 12 ------------ include/net/route.h | 1 - 5 files changed, 7 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 2578811cef51..0a9a723f6c19 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -44,6 +44,7 @@ struct dst_entry { #else void *__pad1; #endif + struct lwtunnel_state *lwtstate; int (*input)(struct sk_buff *); int (*output)(struct sock *sk, struct sk_buff *skb); @@ -89,7 +90,7 @@ struct dst_entry { * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT - long __pad_to_align_refcnt[2]; + long __pad_to_align_refcnt[1]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 075f523ff23f..2cb52d562272 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,22 +23,17 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, - int family) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); - struct rtable *rt; + struct dst_entry *dst; if (md_dst) return &md_dst->u.tun_info; - switch (family) { - case AF_INET: - rt = (struct rtable *)skb_dst(skb); - if (rt && rt->rt_lwtstate) - return lwt_tun_info(rt->rt_lwtstate); - break; - } + dst = skb_dst(skb); + if (dst && dst->lwtstate) + return lwt_tun_info(dst->lwtstate); return NULL; } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 276328e3daa6..063d30474cf6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -133,7 +133,6 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; - struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index cfee53916ba5..843489884448 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -87,9 +87,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); -int lwtunnel_input6(struct sk_buff *skb); #else @@ -164,21 +162,11 @@ static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } -static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } -static inline int lwtunnel_input6(struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/include/net/route.h b/include/net/route.h index 6dda2c1bf8c6..395d79bb556c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,7 +66,6 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; - struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) -- cgit v1.2.3 From ab450605b35caa768ca33e86db9403229bf42be4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:27 +0200 Subject: ipv6: ndisc: inherit metadata dst when creating ndisc requests If output device wants to see the dst, inherit the dst of the original skb in the ndisc request. This is an IPv6 counterpart of commit 0accfc268f4d ("arp: Inherit metadata dst when creating ARP requests"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ndisc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index b3a7751251b4..aba5695fadb0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -182,7 +182,8 @@ int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); -- cgit v1.2.3 From 705cc62f6728c5a23e3c82465aa94e652e0b50e4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:28 +0200 Subject: vxlan: provide access function for vxlan socket address family Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/vxlan.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e4534f1b2d8c..43677e6b9c43 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -241,3 +241,8 @@ static inline void vxlan_get_rx_port(struct net_device *netdev) } #endif #endif + +static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) +{ + return vs->sock->sk->sk_family; +} -- cgit v1.2.3 From 904af04d30f303d96902584206457128c3051d8d Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:31 +0200 Subject: ipv6: route: extend flow representation with tunnel key Use flowi_tunnel in flowi6 similarly to what is done with IPv4. This complements commit 1b7179d3adff ("route: Extend flow representation with tunnel key"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index f305588fc162..9e0297c4c11d 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -130,6 +130,7 @@ struct flowi6 { #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid +#define flowi6_tun_key __fl_common.flowic_tun_key struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; -- cgit v1.2.3 From e4ff67513096e6e196ca58043fce04d0f87babbe Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 15:03:27 +0300 Subject: ipvs: add sync_maxlen parameter for the sync daemon Allow setups with large MTU to send large sync packets by adding sync_maxlen parameter. The default value is now based on MTU but no more than 1500 for compatibility reasons. To avoid problems if MTU changes allow fragmentation by sending packets with DF=0. Problem reported by Dan Carpenter. Reported-by: Dan Carpenter Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4e3731ee4eac..2fdc13caf712 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -846,6 +846,13 @@ struct ipvs_master_sync_state { /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) +struct ipvs_sync_daemon_cfg { + int syncid; + u16 sync_maxlen; + /* multicast interface name */ + char mcast_ifn[IP_VS_IFNAME_MAXLEN]; +}; + /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ @@ -961,15 +968,10 @@ struct netns_ipvs { spinlock_t sync_buff_lock; struct task_struct **backup_threads; int threads_mask; - int send_mesg_maxlen; - int recv_mesg_maxlen; volatile int sync_state; - volatile int master_syncid; - volatile int backup_syncid; struct mutex sync_mutex; - /* multicast interface name */ - char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ + struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed becaus heterogeneous @@ -1408,7 +1410,8 @@ static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) /* IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid); +int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *cfg, + int state); int stop_sync_thread(struct net *net, int state); void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); -- cgit v1.2.3 From d33288172e72c4729e8b9f2243fb40601afabc8f Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 15:03:28 +0300 Subject: ipvs: add more mcast parameters for the sync daemon - mcast_group: configure the multicast address, now IPv6 is supported too - mcast_port: configure the multicast port - mcast_ttl: configure the multicast TTL/HOP_LIMIT Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2fdc13caf712..9b9ca87a4210 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -847,8 +847,12 @@ struct ipvs_master_sync_state { #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) struct ipvs_sync_daemon_cfg { + union nf_inet_addr mcast_group; int syncid; u16 sync_maxlen; + u16 mcast_port; + u8 mcast_af; + u8 mcast_ttl; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; -- cgit v1.2.3 From 58ce31cca1ffe057f4744c3f671e3e84606d3d4a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 19 Aug 2015 17:07:33 -0700 Subject: vxlan: GRO support at tunnel layer Add calls to gro_cells infrastructure to do GRO when receiving on a tunnel. Testing: Ran 200 netperf TCP_STREAM instance - With fix (GRO enabled on VXLAN interface) Verify GRO is happening. 9084 MBps tput 3.44% CPU utilization - Without fix (GRO disabled on VXLAN interface) Verified no GRO is happening. 9084 MBps tput 5.54% CPU utilization Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/vxlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 43677e6b9c43..6b3234599a2c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -161,6 +161,7 @@ struct vxlan_dev { struct timer_list age_timer; spinlock_t hash_lock; unsigned int addrcnt; + struct gro_cells gro_cells; struct vxlan_config cfg; -- cgit v1.2.3 From 751a587ac9f9a8bf314590fbac32d9e418060c5a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 21 Aug 2015 12:41:14 +0200 Subject: route: fix breakage after moving lwtunnel state __recnt and related fields need to be in its own cacheline for performance reasons. Commit 61adedf3e3f1 ("route: move lwtunnel state to dst_entry") broke that on 32bit archs, causing BUILD_BUG_ON in dst_hold to be triggered. This patch fixes the breakage by moving the lwtunnel state to the end of dst_entry on 32bit archs. Unfortunately, this makes it share the cacheline with __refcnt and may affect performance, thus further patches may be needed. Reported-by: kbuild test robot Fixes: 61adedf3e3f1 ("route: move lwtunnel state to dst_entry") Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/dst.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 0a9a723f6c19..ef8f1d43a203 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -44,7 +44,6 @@ struct dst_entry { #else void *__pad1; #endif - struct lwtunnel_state *lwtstate; int (*input)(struct sk_buff *); int (*output)(struct sock *sk, struct sk_buff *skb); @@ -85,11 +84,12 @@ struct dst_entry { __u32 __pad2; #endif +#ifdef CONFIG_64BIT + struct lwtunnel_state *lwtstate; /* * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ -#ifdef CONFIG_64BIT long __pad_to_align_refcnt[1]; #endif /* @@ -99,6 +99,9 @@ struct dst_entry { atomic_t __refcnt; /* client references */ int __use; unsigned long lastuse; +#ifndef CONFIG_64BIT + struct lwtunnel_state *lwtstate; +#endif union { struct dst_entry *next; struct rtable __rcu *rt_next; -- cgit v1.2.3 From 127eb7cd3c210afead788991a30950a9e36759ea Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 24 Aug 2015 09:45:41 -0700 Subject: lwt: Add cfg argument to build_state Add cfg and family arguments to lwt build state functions. cfg is a void pointer and will either be a pointer to a fib_config or fib6_config structure. The family parameter indicates which one (either AF_INET or AF_INET6). LWT encpasulation implementation may use the fib configuration to build the LWT state. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 843489884448..fce0e35e74d0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -26,6 +26,7 @@ struct lwtunnel_state { struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **ts); int (*output)(struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); @@ -80,6 +81,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_build_state(struct net_device *dev, u16 encap_type, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **lws); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate); @@ -130,6 +132,7 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **lws) { return -EOPNOTSUPP; -- cgit v1.2.3 From 6f021c62d64f38092bc2a0c5fe7b81d5e5b21a00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Aug 2015 12:30:00 -0700 Subject: tcp: fix slow start after idle vs TSO/GSO slow start after idle might reduce cwnd, but we perform this after first packet was cooked and sent. With TSO/GSO, it means that we might send a full TSO packet even if cwnd should have been reduced to IW10. Moving the SSAI check in skb_entail() makes sense, because we slightly reduce number of times this check is done, especially for large send() and TCP Small queue callbacks from softirq context. As Neal pointed out, we also need to perform the check if/when receive window opens. Tested: Following packetdrill test demonstrates the problem // Test of slow start after idle `sysctl -q net.ipv4.tcp_slow_start_after_idle=1` 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 65535 +0 > S. 0:0(0) ack 1 +.100 < . 1:1(0) ack 1 win 511 +0 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 +0 write(4, ..., 26000) = 26000 +0 > . 1:5001(5000) ack 1 +0 > . 5001:10001(5000) ack 1 +0 %{ assert tcpi_snd_cwnd == 10 }% +.100 < . 1:1(0) ack 10001 win 511 +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% +0 > . 10001:20001(10000) ack 1 +0 > P. 20001:26001(6000) ack 1 +.100 < . 1:1(0) ack 26001 win 511 +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% +4 write(4, ..., 20000) = 20000 // If slow start after idle works properly, we should send 5 MSS here (cwnd/2) +0 > . 26001:31001(5000) ack 1 +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% +0 > . 31001:36001(5000) ack 1 Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 364426a2be5a..309801f7eb82 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1165,6 +1165,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) } u32 tcp_default_init_rwnd(u32 mss); +void tcp_cwnd_restart(struct sock *sk, s32 delta); + +static inline void tcp_slow_start_after_idle_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta; + + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out) + return; + delta = tcp_time_stamp - tp->lsndtime; + if (delta > inet_csk(sk)->icsk_rto) + tcp_cwnd_restart(sk, delta); +} /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, -- cgit v1.2.3 From 43e122b014c955a33220fabbd09c4b5e4f422c3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Aug 2015 17:38:02 -0700 Subject: tcp: refine pacing rate determination When TCP pacing was added back in linux-3.12, we chose to apply a fixed ratio of 200 % against current rate, to allow probing for optimal throughput even during slow start phase, where cwnd can be doubled every other gRTT. At Google, we found it was better applying a different ratio while in Congestion Avoidance phase. This ratio was set to 120 %. We've used the normal tcp_in_slow_start() helper for a while, then tuned the condition to select the conservative ratio as soon as cwnd >= ssthresh/2 : - After cwnd reduction, it is safer to ramp up more slowly, as we approach optimal cwnd. - Initial ramp up (ssthresh == INFINITY) still allows doubling cwnd every other RTT. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 309801f7eb82..4a7b03947a38 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; +extern int sysctl_tcp_pacing_ss_ratio; +extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; -- cgit v1.2.3 From 2c0027cd54cc3ed856e87d9aeddb6ef00f5f17f4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 23 Aug 2015 08:21:22 -0600 Subject: inetpeer: remove dead code Remove various inlined functions not referenced in the kernel. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 67 -------------------------------------------------- 1 file changed, 67 deletions(-) (limited to 'include/net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index d5332ddcea3f..002f0bd27001 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -65,71 +65,12 @@ struct inet_peer_base { int total; }; -#define INETPEER_BASE_BIT 0x1UL - -static inline struct inet_peer *inetpeer_ptr(unsigned long val) -{ - BUG_ON(val & INETPEER_BASE_BIT); - return (struct inet_peer *) val; -} - -static inline struct inet_peer_base *inetpeer_base_ptr(unsigned long val) -{ - if (!(val & INETPEER_BASE_BIT)) - return NULL; - val &= ~INETPEER_BASE_BIT; - return (struct inet_peer_base *) val; -} - -static inline bool inetpeer_ptr_is_peer(unsigned long val) -{ - return !(val & INETPEER_BASE_BIT); -} - -static inline void __inetpeer_ptr_set_peer(unsigned long *val, struct inet_peer *peer) -{ - /* This implicitly clears INETPEER_BASE_BIT */ - *val = (unsigned long) peer; -} - -static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *peer) -{ - unsigned long val = (unsigned long) peer; - unsigned long orig = *ptr; - - if (!(orig & INETPEER_BASE_BIT) || - cmpxchg(ptr, orig, val) != orig) - return false; - return true; -} - -static inline void inetpeer_init_ptr(unsigned long *ptr, struct inet_peer_base *base) -{ - *ptr = (unsigned long) base | INETPEER_BASE_BIT; -} - -static inline void inetpeer_transfer_peer(unsigned long *to, unsigned long *from) -{ - unsigned long val = *from; - - *to = val; - if (inetpeer_ptr_is_peer(val)) { - struct inet_peer *peer = inetpeer_ptr(val); - atomic_inc(&peer->refcnt); - } -} - void inet_peer_base_init(struct inet_peer_base *); void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) -static inline bool inet_metrics_new(const struct inet_peer *p) -{ - return p->metrics[RTAX_LOCK-1] == INETPEER_METRICS_NEW; -} - /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, @@ -163,12 +104,4 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); -/* - * temporary check to make sure we dont access rid, tcp_ts, - * tcp_ts_stamp if no refcount is taken on inet_peer - */ -static inline void inet_peer_refcheck(const struct inet_peer *p) -{ - WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0); -} #endif /* _NET_INETPEER_H */ -- cgit v1.2.3 From 48e92c44bd73a8bc213560058e6b18e45929526e Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 25 Aug 2015 18:36:50 +0200 Subject: vxlan: fix multiple inclusion of vxlan.h The vxlan_get_sk_family inline function was added after the last #endif, making multiple inclusion of net/vxlan.h fail. Move it to the proper place. Reported-by: Mark Rustad Fixes: 705cc62f6728c ("vxlan: provide access function for vxlan socket address family") Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6b3234599a2c..480a319b4c92 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -241,9 +241,10 @@ static inline void vxlan_get_rx_port(struct net_device *netdev) { } #endif -#endif static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; } + +#endif -- cgit v1.2.3 From 3c645621b79828be7a46fb2694eb423b343b4bbe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:31 -0700 Subject: net_sched: make tcf_hash_destroy() static tcf_hash_destroy() used once. Make it static. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/act_api.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 4519c81304bd..9d446f136607 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -111,7 +111,6 @@ struct tc_action_ops { }; int tcf_hash_search(struct tc_action *a, u32 index); -void tcf_hash_destroy(struct tc_action *a); u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo); int tcf_hash_check(u32 index, struct tc_action *a, int bind); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, -- cgit v1.2.3 From cff82457c5584f6a96d2b85d1a88b81ba304a330 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:35 -0700 Subject: net_sched: act_bpf: remove spinlock in fast path Similar to act_gact/act_mirred, act_bpf can be lockless in packet processing with extra care taken to free bpf programs after rcu grace period. Replacement of existing act_bpf (very rare) is done with synchronize_rcu() and final destruction is done from tc_action_ops->cleanup() callback that is called from tcf_exts_destroy()->tcf_action_destroy()->__tcf_hash_release() when bind and refcnt reach zero which is only possible when classifier is destroyed. Previous two patches fixed the last two classifiers (tcindex and rsvp) to call tcf_exts_destroy() from rcu callback. Similar to gact/mirred there is a race between prog->filter and prog->tcf_action. Meaning that the program being replaced may use previous default action if it happened to return TC_ACT_UNSPEC. act_mirred race betwen tcf_action and tcfm_dev is similar. In all cases the race is harmless. Long term we may want to improve the situation by replacing the whole tc_action->priv as single pointer instead of updating inner fields one by one. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/tc_act/tc_bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h index a152e9858b2c..958d69cfb19c 100644 --- a/include/net/tc_act/tc_bpf.h +++ b/include/net/tc_act/tc_bpf.h @@ -15,7 +15,7 @@ struct tcf_bpf { struct tcf_common common; - struct bpf_prog *filter; + struct bpf_prog __rcu *filter; union { u32 bpf_fd; u16 bpf_num_ops; -- cgit v1.2.3 From e79e259588a414589a016edc428ee8dd308f81ad Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:47 -0700 Subject: dst: Add __skb_dst_copy() variation This variation on skb_dst_copy() doesn't require two skbs. Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index ef8f1d43a203..4c4801645371 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -289,13 +289,18 @@ static inline void skb_dst_drop(struct sk_buff *skb) } } -static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) +static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { - nskb->_skb_refdst = oskb->_skb_refdst; + nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); } +static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) +{ + __skb_dst_copy(nskb, oskb->_skb_refdst); +} + /** * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer -- cgit v1.2.3 From 86ca02e77408bb58ba596c1a411ec7f631733690 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:51 -0700 Subject: netfilter: connlabels: Export setting connlabel length Add functions to change connlabel length into nf_conntrack_labels.c so they may be reused by other modules like OVS and nftables without needing to jump through xt_match_check() hoops. Suggested-by: Florian Westphal Signed-off-by: Joe Stringer Acked-by: Florian Westphal Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_labels.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index dec6336bf850..7e2b1d025f50 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -54,7 +54,11 @@ int nf_connlabels_replace(struct nf_conn *ct, #ifdef CONFIG_NF_CONNTRACK_LABELS int nf_conntrack_labels_init(void); void nf_conntrack_labels_fini(void); +int nf_connlabels_get(struct net *net, unsigned int n_bits); +void nf_connlabels_put(struct net *net); #else static inline int nf_conntrack_labels_init(void) { return 0; } static inline void nf_conntrack_labels_fini(void) {} +static inline int nf_connlabels_get(struct net *net, unsigned int n_bits) { return 0; } +static inline void nf_connlabels_put(struct net *net) {} #endif -- cgit v1.2.3 From 3b3ae880266d148bf73a573a766bc9b78c08d805 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 26 Aug 2015 23:00:06 +0200 Subject: net: sched: consolidate tc_classify{,_compat} For classifiers getting invoked via tc_classify(), we always need an extra function call into tc_classify_compat(), as both are being exported as symbols and tc_classify() itself doesn't do much except handling of reclassifications when tp->classify() returned with TC_ACT_RECLASSIFY. CBQ and ATM are the only qdiscs that directly call into tc_classify_compat(), all others use tc_classify(). When tc actions are being configured out in the kernel, tc_classify() effectively does nothing besides delegating. We could spare this layer and consolidate both functions. pktgen on single CPU constantly pushing skbs directly into the netif_receive_skb() path with a dummy classifier on ingress qdisc attached, improves slightly from 22.3Mpps to 23.1Mpps. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2342bf12cb78..401038d2f9b8 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -110,10 +110,8 @@ static inline void qdisc_run(struct Qdisc *q) __qdisc_run(q); } -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res); int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res); + struct tcf_result *res, bool compat_mode); static inline __be16 tc_skb_protocol(const struct sk_buff *skb) { -- cgit v1.2.3 From c29a70d2cadfea443c027d23481f820530b70057 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:50 -0700 Subject: tunnel: introduce udp_tun_rx_dst() Introduce function udp_tun_rx_dst() to initialize tunnel dst on receive path. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++ include/net/udp_tunnel.h | 4 +++ 2 files changed, 65 insertions(+) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 2cb52d562272..60c03326c087 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -48,4 +48,65 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); +static inline struct metadata_dst *tun_rx_dst(__be16 flags, + __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = flags; + info->key.tun_id = tunnel_id; + info->key.tp_src = 0; + info->key.tp_dst = 0; + return tun_dst; +} + +static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ + const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; + return tun_dst; +} + +static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.u.ipv6.src = ip6h->saddr; + info->key.u.ipv6.dst = ip6h->daddr; + info->key.tos = ipv6_get_dsfield(ip6h); + info->key.ttl = ip6h->hop_limit; + return tun_dst; +} + #endif /* __NET_DST_METADATA_H */ diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index c491c1221606..35041d0fc21e 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -93,6 +93,10 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, void udp_tunnel_sock_release(struct socket *sock); +struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, + __be16 flags, __be64 tunnel_id, + int md_size); + static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) { -- cgit v1.2.3 From e305ac6cf5a1e1386aedce7ef9cb773635d5845c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:52 -0700 Subject: geneve: Add support to collect tunnel metadata. Following patch create new tunnel flag which enable tunnel metadata collection on given device. These devices can be used by tunnel metadata based routing or by OVS. Geneve Consolidation patch get rid of collect_md_tun to simplify tunnel lookup further. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/geneve.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/geneve.h b/include/net/geneve.h index 2a0543a1899d..4245e1d23b9b 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -96,6 +96,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, bool csum, bool xnet); + +struct net_device *geneve_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port); #endif /*ifdef CONFIG_INET */ #endif /*ifdef__NET_GENEVE_H */ -- cgit v1.2.3 From 371bd1061d29562e6423435073623add8c475ee2 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:54 -0700 Subject: geneve: Consolidate Geneve functionality in single module. geneve_core module handles send and receive functionality. This way OVS could use the Geneve API. Now with use of tunnel meatadata mode OVS can directly use Geneve netdevice. So there is no need for separate module for Geneve. Following patch consolidates Geneve protocol processing in single module. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: John W. Linville Signed-off-by: David S. Miller --- include/net/geneve.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'include/net') diff --git a/include/net/geneve.h b/include/net/geneve.h index 4245e1d23b9b..3106ed6eae0d 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -62,41 +62,7 @@ struct genevehdr { struct geneve_opt options[]; }; -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - #ifdef CONFIG_INET -struct geneve_sock; - -typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); - -struct geneve_sock { - struct list_head list; - geneve_rcv_t *rcv; - void *rcv_data; - struct socket *sock; - struct rcu_head rcu; - int refcnt; - struct udp_offload udp_offloads; -}; - -#define GENEVE_VER 0 -#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6); - -void geneve_sock_release(struct geneve_sock *vs); - -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet); - struct net_device *geneve_dev_create_fb(struct net *net, const char *name, u8 name_assign_type, u16 dst_port); #endif /*ifdef CONFIG_INET */ -- cgit v1.2.3 From d66d6c3152e8d5a6db42a56bf7ae1c6cae87ba48 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Aug 2015 21:21:38 +0200 Subject: net: sched: register noqueue qdisc This way users can attach noqueue just like any other qdisc using tc without having to mess with tx_queue_len first. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2eab08c38e32..444faa89a55f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -340,6 +340,7 @@ extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern struct Qdisc_ops mq_qdisc_ops; +extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; struct Qdisc_class_common { -- cgit v1.2.3 From 72afa352d6a3d4da7783b5ddee02b94be49e051a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:06:59 -0700 Subject: net: Introduce ipv4_addr_hash and use it for tcp metrics Refactors a common line into helper function. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index bee5f3582e38..7b9e1c782aa3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -458,6 +458,11 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +static inline unsigned int ipv4_addr_hash(__be32 ip) +{ + return (__force unsigned int) ip; +} + bool ip_call_ra_chain(struct sk_buff *skb); /* -- cgit v1.2.3 From 3abef286cf2f138de353fb0b54453621de961043 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:00 -0700 Subject: net: Add set,get helpers for inetpeer addresses Use inetpeer set,get helpers in tcp_metrics rather than peeking into the inetpeer_addr struct. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 002f0bd27001..f75b9e7036a2 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -71,6 +71,29 @@ void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) +static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) +{ + iaddr->addr.a4 = ip; + iaddr->family = AF_INET; +} + +static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) +{ + return iaddr->addr.a4; +} + +static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, + struct in6_addr *in6) +{ + iaddr->addr.in6 = *in6; + iaddr->family = AF_INET6; +} + +static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) +{ + return &iaddr->addr.in6; +} + /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, -- cgit v1.2.3 From d39d14ffa24cca9f0e44aa4a63315f4c44c56a93 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:01 -0700 Subject: net: Add helper function to compare inetpeer addresses tcp_metrics and inetpeer both have functions to compare inetpeer addresses. Consolidate into 1 version. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index f75b9e7036a2..9d9b3446731d 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -121,6 +121,22 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, return inet_getpeer(base, &daddr, create); } +static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, + const struct inetpeer_addr *b) +{ + int i, n = (a->family == AF_INET ? 1 : 4); + + for (i = 0; i < n; i++) { + if (a->addr.a6[i] == b->addr.a6[i]) + continue; + if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) + return -1; + return 1; + } + + return 0; +} + /* can be called from BH context or outside */ void inet_putpeer(struct inet_peer *p); bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); -- cgit v1.2.3 From 5345c2e12d41f815c1009c9dee72f3d5fcfd4282 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:02 -0700 Subject: net: Refactor inetpeer address struct Move the inetpeer_addr_base union to inetpeer_addr and drop inetpeer_addr_base. Both the a6 and in6_addr overlays are not needed; drop the __be32 version and rename in6 to a6 for consistency with ipv4. Add a new u32 array to the union which removes the need for the typecast in the compare function and the use of a consistent arg for both ipv4 and ipv6 addresses which makes the compare function more readable. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 9d9b3446731d..e34f98aa93b1 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -15,16 +15,14 @@ #include #include -struct inetpeer_addr_base { +#define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) + +struct inetpeer_addr { union { __be32 a4; - __be32 a6[4]; - struct in6_addr in6; + struct in6_addr a6; + u32 key[INETPEER_MAXKEYSZ]; }; -}; - -struct inetpeer_addr { - struct inetpeer_addr_base addr; __u16 family; }; @@ -73,25 +71,25 @@ void inet_initpeers(void) __init; static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { - iaddr->addr.a4 = ip; + iaddr->a4 = ip; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { - return iaddr->addr.a4; + return iaddr->a4; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, struct in6_addr *in6) { - iaddr->addr.in6 = *in6; + iaddr->a6 = *in6; iaddr->family = AF_INET6; } static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) { - return &iaddr->addr.in6; + return &iaddr->a6; } /* can be called with or without local BH being disabled */ @@ -105,7 +103,7 @@ static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, { struct inetpeer_addr daddr; - daddr.addr.a4 = v4daddr; + daddr.a4 = v4daddr; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } @@ -116,7 +114,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, { struct inetpeer_addr daddr; - daddr.addr.in6 = *v6daddr; + daddr.a6 = *v6daddr; daddr.family = AF_INET6; return inet_getpeer(base, &daddr, create); } @@ -124,12 +122,17 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - int i, n = (a->family == AF_INET ? 1 : 4); + int i, n; + + if (a->family == AF_INET) + n = sizeof(a->a4) / sizeof(u32); + else + n = sizeof(a->a6) / sizeof(u32); for (i = 0; i < n; i++) { - if (a->addr.a6[i] == b->addr.a6[i]) + if (a->key[i] == b->key[i]) continue; - if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) + if (a->key[i] < b->key[i]) return -1; return 1; } -- cgit v1.2.3 From 192132b9a034d87566294be0fba5f8f75c2cf16b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:03 -0700 Subject: net: Add support for VRFs to inetpeer cache inetpeer caches based on address only, so duplicate IP addresses within a namespace return the same cached entry. Enhance the ipv4 address key to contain both the IPv4 address and VRF device index. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index e34f98aa93b1..4a6009d4486b 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -15,11 +15,17 @@ #include #include +/* IPv4 address key for cache lookups */ +struct ipv4_addr_key { + __be32 addr; + int vif; +}; + #define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) struct inetpeer_addr { union { - __be32 a4; + struct ipv4_addr_key a4; struct in6_addr a6; u32 key[INETPEER_MAXKEYSZ]; }; @@ -71,13 +77,13 @@ void inet_initpeers(void) __init; static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { - iaddr->a4 = ip; + iaddr->a4.addr = ip; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { - return iaddr->a4; + return iaddr->a4.addr; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, @@ -99,11 +105,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, - int create) + int vif, int create) { struct inetpeer_addr daddr; - daddr.a4 = v4daddr; + daddr.a4.addr = v4daddr; + daddr.a4.vif = vif; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } -- cgit v1.2.3 From 46fa062ad63146dd138ec0f017e71224471e8ea5 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:19 +0200 Subject: ip_tunnels: convert the mode field of ip_tunnel_info to flags The mode field holds a single bit of information only (whether the ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags. This allows more mode flags to be added. Signed-off-by: Jiri Benc Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 - include/net/ip_tunnels.h | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 60c03326c087..2b83f0d232e0 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -59,7 +59,6 @@ static inline struct metadata_dst *tun_rx_dst(__be16 flags, return NULL; info = &tun_dst->u.tun_info; - info->mode = IP_TUNNEL_INFO_RX; info->key.tun_flags = flags; info->key.tun_id = tunnel_id; info->key.tp_src = 0; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 224e4ecec91b..9bdb3948798f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -50,13 +50,8 @@ struct ip_tunnel_key { __be16 tp_dst; }; -/* Indicates whether the tunnel info structure represents receive - * or transmit tunnel parameters. - */ -enum { - IP_TUNNEL_INFO_RX, - IP_TUNNEL_INFO_TX, -}; +/* Flags for ip_tunnel_info mode. */ +#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ struct ip_tunnel_info { struct ip_tunnel_key key; -- cgit v1.2.3 From 7f9562a1f405306eacb97f95d78cb996e33f27f5 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:20 +0200 Subject: ip_tunnels: record IP version in tunnel info There's currently nothing preventing directing packets with IPv6 encapsulation data to IPv4 tunnels (and vice versa). If this happens, IPv6 addresses are incorrectly interpreted as IPv4 ones. Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this in ip_tunnel_info. Reject packets at appropriate places if they are supposed to be encapsulated into an incompatible protocol. Signed-off-by: Jiri Benc Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + include/net/ip_tunnels.h | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 2b83f0d232e0..d32f49cc621d 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -105,6 +105,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.u.ipv6.dst = ip6h->daddr; info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; + info->mode = IP_TUNNEL_INFO_IPV6; return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9bdb3948798f..2b4fa06e91bd 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ struct ip_tunnel_key { /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ +#define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ struct ip_tunnel_info { struct ip_tunnel_key key; @@ -208,6 +210,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_info->options = opts; tun_info->options_len = opts_len; + + tun_info->mode = 0; } static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, @@ -221,6 +225,12 @@ static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_id, tun_flags, opts, opts_len); } +static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info + *tun_info) +{ + return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); -- cgit v1.2.3 From a43a9ef6a2e510fec61176ff2c34fab3e7d581da Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:22 +0200 Subject: vxlan: do not receive IPv4 packets on IPv6 socket By default (subject to the sysctl settings), IPv6 sockets listen also for IPv4 traffic. Vxlan is not prepared for that and expects IPv6 header in packets received through an IPv6 socket. In addition, it's currently not possible to have both IPv4 and IPv6 vxlan tunnel on the same port (unless bindv6only sysctl is enabled), as it's not possible to create and bind both IPv4 and IPv6 vxlan interfaces and there's no way to specify both IPv4 and IPv6 remote/group IP addresses. Set IPV6_V6ONLY on vxlan sockets to fix both of these issues. This is not done globally in udp_tunnel, as l2tp and tipc seems to work okay when receiving IPv4 packets on IPv6 socket and people may rely on this behavior. The other tunnels (geneve and fou) do not support IPv6. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 35041d0fc21e..cb2f89f20f5c 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -31,7 +31,8 @@ struct udp_port_cfg { __be16 peer_udp_port; unsigned int use_udp_checksums:1, use_udp6_tx_checksums:1, - use_udp6_rx_checksums:1; + use_udp6_rx_checksums:1, + ipv6_v6only:1; }; int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, -- cgit v1.2.3 From c4c6bc314618f60ba69b0cbf93e506e4c38a11d2 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Sun, 30 Aug 2015 11:29:41 +0530 Subject: net: Introduce helper functions to get the per cpu data Signed-off-by: Raghavendra K T Signed-off-by: David S. Miller --- include/net/ip.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 7b9e1c782aa3..9b9ca2839399 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -202,10 +202,20 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd) #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd) +u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offct); unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, + size_t syncp_offset); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else +static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, + size_t syncp_offset) +{ + return snmp_get_cpu_field(mib, cpu, offct); + +} + static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); -- cgit v1.2.3 From 4c22279848c531fc7f555d463daf3d0df963bd41 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sun, 30 Aug 2015 18:09:38 -0700 Subject: ip-tunnel: Use API to access tunnel metadata options. Currently tun-info options pointer is used in few cases to pass options around. But tunnel options can be accessed using ip_tunnel_info_opts() API without using the pointer. Following patch removes the redundant pointer and consistently make use of API. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Reviewed-by: Jesse Gross Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 31 ++++++++++----------- include/net/ip_tunnels.h | 67 +++++++++++++++++++++------------------------- 2 files changed, 45 insertions(+), 53 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index d32f49cc621d..547ab8241593 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -48,21 +48,16 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); -static inline struct metadata_dst *tun_rx_dst(__be16 flags, - __be64 tunnel_id, int md_size) +static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC); if (!tun_dst) return NULL; - info = &tun_dst->u.tun_info; - info->key.tun_flags = flags; - info->key.tun_id = tunnel_id; - info->key.tp_src = 0; - info->key.tp_dst = 0; + tun_dst->u.tun_info.options_len = 0; + tun_dst->u.tun_info.mode = 0; return tun_dst; } @@ -73,17 +68,14 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; - tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; - info = &tun_dst->u.tun_info; - info->key.u.ipv4.src = iph->saddr; - info->key.u.ipv4.dst = iph->daddr; - info->key.tos = iph->tos; - info->key.ttl = iph->ttl; + ip_tunnel_key_init(&tun_dst->u.tun_info.key, + iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, 0, tunnel_id, flags); return tun_dst; } @@ -96,16 +88,21 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, struct metadata_dst *tun_dst; struct ip_tunnel_info *info; - tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; + info->mode = IP_TUNNEL_INFO_IPV6; + info->key.tun_flags = flags; + info->key.tun_id = tunnel_id; + info->key.tp_src = 0; + info->key.tp_dst = 0; + info->key.u.ipv6.src = ip6h->saddr; info->key.u.ipv6.dst = ip6h->daddr; info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; - info->mode = IP_TUNNEL_INFO_IPV6; return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 2b4fa06e91bd..9a6a3ba888e8 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -57,7 +57,6 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; - const void *options; u8 options_len; u8 mode; }; @@ -180,49 +179,32 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); -static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags, - const void *opts, u8 opts_len) +static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags) { - tun_info->key.tun_id = tun_id; - tun_info->key.u.ipv4.src = saddr; - tun_info->key.u.ipv4.dst = daddr; - memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, + key->tun_id = tun_id; + key->u.ipv4.src = saddr; + key->u.ipv4.dst = daddr; + memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); - tun_info->key.tos = tos; - tun_info->key.ttl = ttl; - tun_info->key.tun_flags = tun_flags; + key->tos = tos; + key->ttl = ttl; + key->tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ - tun_info->key.tp_src = tp_src; - tun_info->key.tp_dst = tp_dst; + key->tp_src = tp_src; + key->tp_dst = tp_dst; /* Clear struct padding. */ - if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; - - tun_info->mode = 0; -} - -static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags, - const void *opts, u8 opts_len) -{ - __ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, tp_src, tp_dst, - tun_id, tun_flags, opts, opts_len); + if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info @@ -317,11 +299,24 @@ static inline void iptunnel_xmit_stats(int err, } } -static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info) { return info + 1; } +static inline void ip_tunnel_info_opts_get(void *to, + const struct ip_tunnel_info *info) +{ + memcpy(to, info + 1, info->options_len); +} + +static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, + const void *from, int len) +{ + memcpy(ip_tunnel_info_opts(info), from, len); + info->options_len = len; +} + static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; -- cgit v1.2.3 From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:47 +0200 Subject: tcp: use dctcp if enabled on the route to the initiator Currently, the following case doesn't use DCTCP, even if it should: A responder has f.e. Cubic as system wide default, but for a specific route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The initiating host then uses DCTCP as congestion control, but since the initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok, and we have to fall back to Reno after 3WHS completes. We were thinking on how to solve this in a minimal, non-intrusive way without bloating tcp_ecn_create_request() needlessly: lets cache the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0) is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows to only do a single metric feature lookup inside tcp_ecn_create_request(). Joint work with Florian Westphal. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/dst.h | 6 ++++++ include/net/tcp.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 4c4801645371..9261d928303d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) p[metric-1] = val; } +/* Kernel-internal feature bits that are unallocated in user space. */ +#define DST_FEATURE_ECN_CA (1 << 31) + +#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) +#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) + static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a7b03947a38..0cab28cd43a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else -- cgit v1.2.3 From c42858eaf4926eb2f44f3e26731b276ab966ac28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Aug 2015 13:57:34 -0700 Subject: gro_cells: remove spinlock protecting receive queues As David pointed out, spinlock are no longer needed to protect the per cpu queues used in gro cells infrastructure. Also use new napi_complete_done() API so that gro_flush_timeout tweaks have an effect. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/gro_cells.h | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h index 0f712c0bc0bf..cf6c74550baa 100644 --- a/include/net/gro_cells.h +++ b/include/net/gro_cells.h @@ -32,37 +32,28 @@ static inline void gro_cells_receive(struct gro_cells *gcells, struct sk_buff *s return; } - /* We run in BH context */ - spin_lock(&cell->napi_skbs.lock); - __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); - - spin_unlock(&cell->napi_skbs.lock); } -/* called unser BH context */ +/* called under BH context */ static inline int gro_cell_poll(struct napi_struct *napi, int budget) { struct gro_cell *cell = container_of(napi, struct gro_cell, napi); struct sk_buff *skb; int work_done = 0; - spin_lock(&cell->napi_skbs.lock); while (work_done < budget) { skb = __skb_dequeue(&cell->napi_skbs); if (!skb) break; - spin_unlock(&cell->napi_skbs.lock); napi_gro_receive(napi, skb); work_done++; - spin_lock(&cell->napi_skbs.lock); } if (work_done < budget) - napi_complete(napi); - spin_unlock(&cell->napi_skbs.lock); + napi_complete_done(napi, work_done); return work_done; } @@ -77,7 +68,7 @@ static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *de for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); - skb_queue_head_init(&cell->napi_skbs); + __skb_queue_head_init(&cell->napi_skbs); netif_napi_add(dev, &cell->napi, gro_cell_poll, 64); napi_enable(&cell->napi); } @@ -92,8 +83,9 @@ static inline void gro_cells_destroy(struct gro_cells *gcells) return; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + netif_napi_del(&cell->napi); - skb_queue_purge(&cell->napi_skbs); + __skb_queue_purge(&cell->napi_skbs); } free_percpu(gcells->cells); gcells->cells = NULL; -- cgit v1.2.3 From 63b6c13dbb7d3e36f031629f7e4e86dacfcab8cf Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 31 Aug 2015 20:05:57 -0700 Subject: tun_dst: Remove opts_size opts_size is only written and never read. Following patch removes this unused variable. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 547ab8241593..af9d5382f6cb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -7,7 +7,6 @@ struct metadata_dst { struct dst_entry dst; - size_t opts_len; union { struct ip_tunnel_info tun_info; } u; -- cgit v1.2.3 From 9cf94eab8b309e8bcc78b41dd1561c75b537dd0b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 19:11:02 +0200 Subject: netfilter: conntrack: use nf_ct_tmpl_free in CT/synproxy error paths Commit 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates") migrated templates to the new allocator api, but forgot to update error paths for them in CT and synproxy to use nf_ct_tmpl_free() instead of nf_conntrack_free(). Due to that, memory is being freed into the wrong kmemcache, but also we drop the per net reference count of ct objects causing an imbalance. In Brad's case, this leads to a wrap-around of net->ct.count and thus lets __nf_conntrack_alloc() refuse to create a new ct object: [ 10.340913] xt_addrtype: ipv6 does not support BROADCAST matching [ 10.810168] nf_conntrack: table full, dropping packet [ 11.917416] r8169 0000:07:00.0 eth0: link up [ 11.917438] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready [ 12.815902] nf_conntrack: table full, dropping packet [ 15.688561] nf_conntrack: table full, dropping packet [ 15.689365] nf_conntrack: table full, dropping packet [ 15.690169] nf_conntrack: table full, dropping packet [ 15.690967] nf_conntrack: table full, dropping packet [...] With slab debugging, it also reports the wrong kmemcache (kmalloc-512 vs. nf_conntrack_ffffffff81ce75c0) and reports poison overwrites, etc. Thus, to fix the problem, export and use nf_ct_tmpl_free() instead. Fixes: 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates") Reported-by: Brad Jackson Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 37cd3911d5c5..4023c4ce260f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -292,6 +292,7 @@ extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +void nf_ct_tmpl_free(struct nf_conn *tmpl); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) -- cgit v1.2.3 From 9b8ff51822893e743eee09350c1928daa3ef503f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 1 Sep 2015 14:26:35 -0600 Subject: net: Make table id type u32 A number of VRF patches used 'int' for table id. It should be u32 to be consistent with the rest of the stack. Fixes: 4e3c89920cd3a ("net: Introduce VRF related flags and helpers") 15be405eb2ea9 ("net: Add inet_addr lookup by table") 30bbaa1950055 ("net: Fix up inet_addr_type checks") 021dd3b8a142d ("net: Add routes to the table associated with the device") dc028da54ed35 ("inet: Move VRF table lookup to inlined function") f6d3c19274c74 ("net: FIB tracepoints") Signed-off-by: David Ahern Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/route.h | 2 +- include/net/vrf.h | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 395d79bb556c..cc61cb95f059 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -188,7 +188,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); -unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); +unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); unsigned int inet_addr_type_dev_table(struct net *net, diff --git a/include/net/vrf.h b/include/net/vrf.h index 5bfb16237fd7..593e6094ddd4 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -66,9 +66,9 @@ static inline int vrf_master_ifindex(const struct net_device *dev) } /* called with rcu_read_lock */ -static inline int vrf_dev_table_rcu(const struct net_device *dev) +static inline u32 vrf_dev_table_rcu(const struct net_device *dev) { - int tb_id = 0; + u32 tb_id = 0; if (dev) { struct net_vrf_dev *vrf_ptr; @@ -80,9 +80,9 @@ static inline int vrf_dev_table_rcu(const struct net_device *dev) return tb_id; } -static inline int vrf_dev_table(const struct net_device *dev) +static inline u32 vrf_dev_table(const struct net_device *dev) { - int tb_id; + u32 tb_id; rcu_read_lock(); tb_id = vrf_dev_table_rcu(dev); @@ -91,10 +91,10 @@ static inline int vrf_dev_table(const struct net_device *dev) return tb_id; } -static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +static inline u32 vrf_dev_table_ifindex(struct net *net, int ifindex) { struct net_device *dev; - int tb_id = 0; + u32 tb_id = 0; if (!ifindex) return 0; @@ -111,9 +111,9 @@ static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) } /* called with rtnl */ -static inline int vrf_dev_table_rtnl(const struct net_device *dev) +static inline u32 vrf_dev_table_rtnl(const struct net_device *dev) { - int tb_id = 0; + u32 tb_id = 0; if (dev) { struct net_vrf_dev *vrf_ptr; @@ -149,22 +149,22 @@ static inline int vrf_master_ifindex(const struct net_device *dev) return 0; } -static inline int vrf_dev_table_rcu(const struct net_device *dev) +static inline u32 vrf_dev_table_rcu(const struct net_device *dev) { return 0; } -static inline int vrf_dev_table(const struct net_device *dev) +static inline u32 vrf_dev_table(const struct net_device *dev) { return 0; } -static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +static inline u32 vrf_dev_table_ifindex(struct net *net, int ifindex) { return 0; } -static inline int vrf_dev_table_rtnl(const struct net_device *dev) +static inline u32 vrf_dev_table_rtnl(const struct net_device *dev) { return 0; } -- cgit v1.2.3 From e5276937ae6e654a811345f0716266f12e77bede Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:23 -0700 Subject: flow_dissector: Move skb related functions to skbuff.h Move the flow dissector functions that are specific to skbuffs into skbuff.h out of flow_dissector.h. This makes flow_dissector.h have no dependencies on skbuff.h. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 50 -------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1a8c22419936..6777a84c6f94 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -2,7 +2,6 @@ #define _NET_FLOW_DISSECTOR_H #include -#include #include #include @@ -134,23 +133,6 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; -void skb_flow_dissector_init(struct flow_dissector *flow_dissector, - const struct flow_dissector_key *key, - unsigned int key_count); - -bool __skb_flow_dissect(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen); - -static inline bool skb_flow_dissect(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) -{ - return __skb_flow_dissect(skb, flow_dissector, target_container, - NULL, 0, 0, 0); -} - struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic @@ -170,38 +152,6 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_buf_dissector; -static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, - struct flow_keys *flow) -{ - memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(skb, &flow_keys_dissector, flow, - NULL, 0, 0, 0); -} - -static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, - void *data, __be16 proto, - int nhoff, int hlen) -{ - memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, - data, proto, nhoff, hlen); -} - -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen_proto); - -static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, - int thoff, u8 ip_proto) -{ - return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); -} - -u32 flow_hash_from_keys(struct flow_keys *keys); -void __skb_get_hash(struct sk_buff *skb); -u32 skb_get_poff(const struct sk_buff *skb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen); - /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a -- cgit v1.2.3 From bcc83839ffdb063dd2b0370cd85c4f825761fc59 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:24 -0700 Subject: skbuff: Make __skb_set_sw_hash a general function Move __skb_set_sw_hash to skbuff.h and add __skb_set_hash which is a common method (between __skb_set_sw_hash and skb_set_hash) to set the hash in an skbuff. Also, move skb_clear_hash to be closer to __skb_set_hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 6777a84c6f94..af76c496f7db 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -167,4 +167,9 @@ struct flow_keys_digest { void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); +static inline bool flow_keys_have_l4(struct flow_keys *keys) +{ + return (keys->ports.ports || keys->tags.flow_label); +} + #endif -- cgit v1.2.3 From c6cc1ca7f4d70cbb3ea3a5ca163c5dabaf155cdb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:25 -0700 Subject: flowi: Abstract out functions to get flow hash based on flowi Create __get_hash_from_flowi6 and __get_hash_from_flowi4 to get the flow keys and hash based on flowi structures. These are called by __skb_get_hash_flowi6 and __skb_get_hash_flowi4. Also, created get_hash_from_flowi6 and get_hash_from_flowi4 which can be called when just the hash value for a flowi is needed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow.h | 19 +++++++++++++++++++ include/net/flow_dissector.h | 2 ++ 2 files changed, 21 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 9e0297c4c11d..dafe97c3c048 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -243,4 +244,22 @@ void flow_cache_flush(struct net *net); void flow_cache_flush_deferred(struct net *net); extern atomic_t flow_cache_genid; +__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys); + +static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6) +{ + struct flow_keys keys; + + return __get_hash_from_flowi6(fl6, &keys); +} + +__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys); + +static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4) +{ + struct flow_keys keys; + + return __get_hash_from_flowi4(fl4, &keys); +} + #endif diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index af76c496f7db..74fe160f0b05 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -172,4 +172,6 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys) return (keys->ports.ports || keys->tags.flow_label); } +u32 flow_hash_from_keys(struct flow_keys *keys); + #endif -- cgit v1.2.3 From 807e165dc44fd93f9d378f861f0540a158d7343a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:28 -0700 Subject: flow_dissector: Add control/reporting of fragmentation Add an input flag to flow dissector on rather dissection should be attempted on a first fragment. Also add key_control flags to indicate that a packet is a fragment or first fragment. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 74fe160f0b05..34102270b086 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -12,6 +12,8 @@ struct flow_dissector_key_control { u16 thoff; u16 addr_type; + u32 is_fragment:1; + u32 first_frag:1; }; /** @@ -122,6 +124,8 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_MAX, }; +#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) + struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* -- cgit v1.2.3 From 8306b688f1a6621b9efe3b0d827e26750528b12a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:30 -0700 Subject: flow_dissector: Add flag to stop parsing at L3 Add an input flag to flow dissector on rather dissection should be stopped when an L3 packet is encountered. This would be useful if a caller just wanted to get IP addresses of the outermost header (e.g. to do an L3 hash). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 34102270b086..bb81e3b576e7 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -125,6 +125,7 @@ enum flow_dissector_key_id { }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) +#define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) struct flow_dissector_key { enum flow_dissector_key_id key_id; -- cgit v1.2.3 From 872b1abb1ed47a691f465fb3d285f6cf6bcd8663 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:31 -0700 Subject: flow_dissector: Add flag to stop parsing when an IPv6 flow label is seen Add an input flag to flow dissector on rather dissection should be stopped when a flow label is encountered. Presumably, the flow label is derived from a sufficient hash of an inner transport packet so further dissection is not needed (that is ports are not included in the flow hash). Using the flow label instead of ports has the additional benefit that packet fragments should hash to same value as non-fragments for a flow (assuming that the same flow label is used). We set this flag by default in for skb_get_hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bb81e3b576e7..66dbc3498efb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -126,6 +126,7 @@ enum flow_dissector_key_id { #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) +#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(2) struct flow_dissector_key { enum flow_dissector_key_id key_id; -- cgit v1.2.3 From 823b96939578eae67b9d6c0e33a39d6a7b6401e7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:32 -0700 Subject: flow_dissector: Add control/reporting of encapsulation Add an input flag to flow dissector on rather dissection should stop when encapsulation is detected (IP/IP or GRE). Also, add a key_control flag that indicates encapsulation was encountered during the dissection. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 66dbc3498efb..bddd1089dbce 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -14,6 +14,7 @@ struct flow_dissector_key_control { u16 addr_type; u32 is_fragment:1; u32 first_frag:1; + u32 encapsulation:1; }; /** @@ -127,6 +128,7 @@ enum flow_dissector_key_id { #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(2) +#define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; -- cgit v1.2.3 From 4b36993d3df0834eff3b4172962de0343a4d9123 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Sep 2015 16:46:08 -0700 Subject: flow_dissector: Don't use bit fields. Just have a flags member instead. In file included from include/linux/linkage.h:4:0, from include/linux/kernel.h:6, from net/core/flow_dissector.c:1: In function 'flow_keys_hash_start', inlined from 'flow_hash_from_keys' at net/core/flow_dissector.c:553:34: >> include/linux/compiler.h:447:38: error: call to '__compiletime_assert_459' declared with attribute error: BUILD_BUG_ON failed: FLOW_KEYS_HASH_OFFSET % sizeof(u32) Reported-by: kbuild test robot Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bddd1089dbce..8c8548cf5888 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -12,11 +12,13 @@ struct flow_dissector_key_control { u16 thoff; u16 addr_type; - u32 is_fragment:1; - u32 first_frag:1; - u32 encapsulation:1; + u32 flags; }; +#define FLOW_DIS_IS_FRAGMENT BIT(0) +#define FLOW_DIS_FIRST_FRAG BIT(1) +#define FLOW_DIS_ENCAPSULATION BIT(2) + /** * struct flow_dissector_key_basic: * @thoff: Transport header offset -- cgit v1.2.3 From 20a17bf6c04e3eca8824c930ecc55ab832558e3b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Sep 2015 21:19:17 -0700 Subject: flow_dissector: Use 'const' where possible. Signed-off-by: David S. Miller --- include/net/flow.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index dafe97c3c048..acd6a096250e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -244,18 +244,18 @@ void flow_cache_flush(struct net *net); void flow_cache_flush_deferred(struct net *net); extern atomic_t flow_cache_genid; -__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys); +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6) +static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6) { struct flow_keys keys; return __get_hash_from_flowi6(fl6, &keys); } -__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys); +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4) +static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4) { struct flow_keys keys; -- cgit v1.2.3 From 62da98656b62a5ca57f22263705175af8ded5aa1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Sep 2015 01:26:07 +0200 Subject: netfilter: nf_conntrack: make nf_ct_zone_dflt built-in Fengguang reported, that some randconfig generated the following linker issue with nf_ct_zone_dflt object involved: [...] CC init/version.o LD init/built-in.o net/built-in.o: In function `ipv4_conntrack_defrag': nf_defrag_ipv4.c:(.text+0x93e95): undefined reference to `nf_ct_zone_dflt' net/built-in.o: In function `ipv6_defrag': nf_defrag_ipv6_hooks.c:(.text+0xe3ffe): undefined reference to `nf_ct_zone_dflt' make: *** [vmlinux] Error 1 Given that configurations exist where we have a built-in part, which is accessing nf_ct_zone_dflt such as the two handlers nf_ct_defrag_user() and nf_ct6_defrag_user(), and a part that configures nf_conntrack as a module, we must move nf_ct_zone_dflt into a fixed, guaranteed built-in area when netfilter is configured in general. Therefore, split the more generic parts into a common header under include/linux/netfilter/ and move nf_ct_zone_dflt into the built-in section that already holds parts related to CONFIG_NF_CONNTRACK in the netfilter core. This fixes the issue on my side. Fixes: 308ac9143ee2 ("netfilter: nf_conntrack: push zone object into functions") Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_zones.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 5316c7b3a374..4e32512cef32 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,24 +1,7 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H -#include - -#define NF_CT_DEFAULT_ZONE_ID 0 - -#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) -#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) - -#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) - -#define NF_CT_FLAG_MARK 1 - -struct nf_conntrack_zone { - u16 id; - u8 flags; - u8 dir; -}; - -extern const struct nf_conntrack_zone nf_ct_zone_dflt; +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include -- cgit v1.2.3 From 22f66895e60cfc55b92f6fa93f05bb3fbdbd0bed Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Tue, 18 Aug 2015 16:52:07 +0300 Subject: mac80211: protect non-HT BSS when HT TDLS traffic exists HT TDLS traffic should be protected in a non-HT BSS to avoid collisions. Therefore, when TDLS peers join/leave, check if protection is (now) needed and set the ht_operation_mode of the virtual interface according to the HT capabilities of the TDLS peer(s). This works because a non-HT BSS connection never sets (or otherwise uses) the ht_operation_mode; it just means that drivers must be aware that this field applies to all HT traffic for this virtual interface, not just the traffic within the BSS. Document that. Signed-off-by: Avri Altman Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e3314e516681..bfc569498bfa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -477,7 +477,9 @@ struct ieee80211_event { * @chandef: Channel definition for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. - * This field is only valid when the channel type is one of the HT types. + * This field is only valid when the channel is a wide HT/VHT channel. + * Note that with TDLS this can be the case (channel is HT, protection must + * be used from this field) even when the BSS association isn't using HT. * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value * implies disabled * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis -- cgit v1.2.3 From 33398cf2f360c5ce24c8a22436d52a06ad4e5eb5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:01:02 -0700 Subject: memcg: export struct mem_cgroup mem_cgroup structure is defined in mm/memcontrol.c currently which means that the code outside of this file has to use external API even for trivial access stuff. This patch exports mm_struct with its dependencies and makes some of the exported functions inlines. This even helps to reduce the code size a bit (make defconfig + CONFIG_MEMCG=y) text data bss dec hex filename 12355346 1823792 1089536 15268674 e8fb42 vmlinux.before 12354970 1823792 1089536 15268298 e8f9ca vmlinux.after This is not much (370B) but better than nothing. We also save a function call in some hot paths like callers of mem_cgroup_count_vm_event which is used for accounting. The patch doesn't introduce any functional changes. [vdavykov@parallels.com: inline memcg_kmem_is_active] [vdavykov@parallels.com: do not expose type outside of CONFIG_MEMCG] [akpm@linux-foundation.org: memcontrol.h needs eventfd.h for eventfd_ctx] [akpm@linux-foundation.org: export mem_cgroup_from_task() to modules] Signed-off-by: Michal Hocko Reviewed-by: Vladimir Davydov Suggested-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 43c6abcf06ab..a98c71ea40c5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1042,34 +1042,6 @@ struct proto { #endif }; -/* - * Bits in struct cg_proto.flags - */ -enum cg_proto_flags { - /* Currently active and new sockets should be assigned to cgroups */ - MEMCG_SOCK_ACTIVE, - /* It was ever activated; we must disarm static keys on destruction */ - MEMCG_SOCK_ACTIVATED, -}; - -struct cg_proto { - struct page_counter memory_allocated; /* Current allocated memory. */ - struct percpu_counter sockets_allocated; /* Current number of sockets. */ - int memory_pressure; - long sysctl_mem[3]; - unsigned long flags; - /* - * memcg field is used to find which memcg we belong directly - * Each memcg struct can hold more than one cg_proto, so container_of - * won't really cut. - * - * The elegant solution would be having an inverse function to - * proto_cgroup in struct proto, but that means polluting the structure - * for everybody, instead of just for memcg users. - */ - struct mem_cgroup *memcg; -}; - int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); -- cgit v1.2.3 From e752eb68811aeece2220e183e23369a34122fb5e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:01:16 -0700 Subject: memcg: move memcg_proto_active from sock.h The only user is sock_update_memcg which is living in memcontrol.c so it doesn't make much sense to pollute sock.h by this inline helper. Move it to memcontrol.c and open code it into its only caller. Signed-off-by: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index a98c71ea40c5..7aa78440559a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1045,11 +1045,6 @@ struct proto { int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); -static inline bool memcg_proto_active(struct cg_proto *cg_proto) -{ - return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); -} - #ifdef SOCK_REFCNT_DEBUG static inline void sk_refcnt_debug_inc(struct sock *sk) { -- cgit v1.2.3 From f53de1e9a4aaf8cbe08845da6f7ff26a078ac507 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 9 Sep 2015 14:20:56 +0200 Subject: net: ipv6: use common fib_default_rule_pref This switches IPv6 policy routing to use the shared fib_default_rule_pref() function of IPv4 and DECnet. It is also used in multicast routing for IPv4 as well as IPv6. The motivation for this patch is a complaint about iproute2 behaving inconsistent between IPv4 and IPv6 when adding policy rules: Formerly, IPv6 rules were assigned a fixed priority of 0x3FFF whereas for IPv4 the assigned priority value was decreased with each rule added. Since then all users of the default_pref field have been converted to assign the generic function fib_default_rule_pref(), fib_nl_newrule() may just use it directly instead. Therefore get rid of the function pointer altogether and make fib_default_rule_pref() static, as it's not used outside fib_rules.c anymore. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/net/fib_rules.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4e8f804f4589..59160de702b6 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -66,7 +66,6 @@ struct fib_rules_ops { struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); - u32 (*default_pref)(struct fib_rules_ops *ops); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush @@ -118,5 +117,4 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); -u32 fib_default_rule_pref(struct fib_rules_ops *ops); #endif -- cgit v1.2.3