From e6247027e5173c00efb2084d688d06ff835bc3b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Dec 2013 04:45:08 -0800 Subject: net: introduce dev_consume_skb_any() Some network drivers use dev_kfree_skb_any() and dev_kfree_skb_irq() helpers to free skbs, both for dropped packets and TX completed ones. We need to separate the two causes to get better diagnostics given by dropwatch or "perf record -e skb:kfree_skb" This patch provides two new helpers, dev_consume_skb_any() and dev_consume_skb_irq() to be used for consumed skbs. __dev_kfree_skb_irq() is slightly optimized to remove one atomic_dec_and_test() in fast path, and use this_cpu_{r|w} accessors. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 53 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f0ed423a360..9d55e5188b96 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2368,17 +2368,52 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, #define DEFAULT_MAX_NUM_RSS_QUEUES (8) int netif_get_num_default_rss_queues(void); -/* Use this variant when it is known for sure that it - * is executing from hardware interrupt context or with hardware interrupts - * disabled. - */ -void dev_kfree_skb_irq(struct sk_buff *skb); +enum skb_free_reason { + SKB_REASON_CONSUMED, + SKB_REASON_DROPPED, +}; + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason); +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); -/* Use this variant in places where it could be invoked - * from either hardware interrupt or other context, with hardware interrupts - * either disabled or enabled. +/* + * It is not allowed to call kfree_skb() or consume_skb() from hardware + * interrupt context or with hardware interrupts being disabled. + * (in_irq() || irqs_disabled()) + * + * We provide four helpers that can be used in following contexts : + * + * dev_kfree_skb_irq(skb) when caller drops a packet from irq context, + * replacing kfree_skb(skb) + * + * dev_consume_skb_irq(skb) when caller consumes a packet from irq context. + * Typically used in place of consume_skb(skb) in TX completion path + * + * dev_kfree_skb_any(skb) when caller doesn't know its current irq context, + * replacing kfree_skb(skb) + * + * dev_consume_skb_any(skb) when caller doesn't know its current irq context, + * and consumed a packet. Used in place of consume_skb(skb) */ -void dev_kfree_skb_any(struct sk_buff *skb); +static inline void dev_kfree_skb_irq(struct sk_buff *skb) +{ + __dev_kfree_skb_irq(skb, SKB_REASON_DROPPED); +} + +static inline void dev_consume_skb_irq(struct sk_buff *skb) +{ + __dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED); +} + +static inline void dev_kfree_skb_any(struct sk_buff *skb) +{ + __dev_kfree_skb_any(skb, SKB_REASON_DROPPED); +} + +static inline void dev_consume_skb_any(struct sk_buff *skb) +{ + __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); +} int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); -- cgit v1.2.3 From 37cb0620073cb64101d9307931c135c70b2e3f04 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 10 Dec 2013 20:45:41 -0800 Subject: tipc: remove TIPC usage of field af_packet_priv in struct net_device TIPC is currently using the field 'af_packet_priv' in struct net_device as a handle to find the bearer instance associated to the given network device. But, by doing so it is blocking other networking cleanups, such as the one discussed here: http://patchwork.ozlabs.org/patch/178044/ This commit removes this usage from TIPC. Instead, we introduce a new field, 'tipc_ptr', to the net_device structure, to serve this purpose. When TIPC bearer is enabled, the bearer object is associated to 'tipc_ptr'. When a TIPC packet arrives in the recv_msg() upcall from a networking device, the bearer object can now be obtained from 'tipc_ptr'. When a bearer is disabled, the bearer object is detached from its underlying network device by setting 'tipc_ptr' to NULL. Additionally, an RCU lock is used to protect the new pointer. Henceforth, the existing tipc_net_lock is used in write mode to serialize write accesses to this pointer, while the new RCU lock is applied on the read side to ensure that the pointer is 100% valid within its wrapped area for all readers. Signed-off-by: Ying Xue Cc: Patrick McHardy Reviewed-by: Paul Gortmaker Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d55e5188b96..0ca8100f9fbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1282,6 +1282,9 @@ struct net_device { #endif #if IS_ENABLED(CONFIG_NET_DSA) struct dsa_switch_tree *dsa_ptr; /* dsa specific data */ +#endif +#if IS_ENABLED(CONFIG_TIPC) + struct tipc_bearer __rcu *tipc_ptr; /* TIPC specific data */ #endif void *atalk_ptr; /* AppleTalk link */ struct in_device __rcu *ip_ptr; /* IPv4 specific data */ -- cgit v1.2.3 From 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Wed, 11 Dec 2013 20:53:45 -0800 Subject: net-gro: Prepare GRO stack for the upcoming tunneling support This patch modifies the GRO stack to avoid the use of "network_header" and associated macros like ip_hdr() and ipv6_hdr() in order to allow an arbitary number of IP hdrs (v4 or v6) to be used in the encapsulation chain. This lays the foundation for various IP tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later. With this patch, the GRO stack traversing now is mostly based on skb_gro_offset rather than special hdr offsets saved in skb (e.g., skb->network_header). As a result all but the top layer (i.e., the the transport layer) must have hdrs of the same length in order for a pkt to be considered for aggregation. Therefore when adding a new encap layer (e.g., for tunneling), one must check and skip flows (e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a different hdr length. Note that unlike the network header, the transport header can and will continue to be set by the GRO code since there will be at most one "transport layer" in the encap chain. Signed-off-by: H.K. Jerry Chu Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0ca8100f9fbc..5260d2eae2e6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1676,7 +1676,7 @@ struct offload_callbacks { int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); - int (*gro_complete)(struct sk_buff *skb); + int (*gro_complete)(struct sk_buff *skb, int nhoff); }; struct packet_offload { -- cgit v1.2.3 From e001bfad913bf119fb67c1e8dd2d4ec1f5d392fa Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Fri, 13 Dec 2013 10:19:55 +0800 Subject: bonding: create bond_first_slave_rcu() The bond_first_slave_rcu() will be used to instead of bond_first_slave() in rcu_read_lock(). According to the Jay Vosburgh's suggestion, the struct netdev_adjacent should hide from users who wanted to use it directly. so I package a new function to get the first slave of the bond. Suggested-by: Nikolay Aleksandrov Suggested-by: Jay Vosburgh Suggested-by: Veaceslav Falico Signed-off-by: Ding Tianhong Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5260d2eae2e6..2c74d20dad34 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2907,6 +2907,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, priv = netdev_lower_get_next_private_rcu(dev, &(iter))) void *netdev_adjacent_get_private(struct list_head *adj_list); +void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); -- cgit v1.2.3 From 477bb93320cec7ae74d5ccfad4f2bfa0b28fbe90 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 13 Dec 2013 12:35:56 -0800 Subject: net: remove dead code for add/del multiple These function to manipulate multiple addresses are not used anywhere in current net-next tree. Some out of tree code maybe using these but too bad; they should submit their code upstream.. Also, make __hw_addr_flush local since only used by dev_addr_lists.c Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c74d20dad34..a0dfcc8c002b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2810,17 +2810,10 @@ int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); /* General hardware address lists handling functions */ -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type); -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type); int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); -void __hw_addr_flush(struct netdev_hw_addr_list *list); void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ @@ -2828,10 +2821,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, unsigned char addr_type); -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, unsigned char addr_type); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); -- cgit v1.2.3 From 1d143d9f0c833fcf38cc737eb0a8698fa2dd144c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sun, 29 Dec 2013 14:01:29 -0800 Subject: net: core functions cleanup The following functions are not used outside of net/core/dev.c and should be declared static. call_netdevice_notifiers_info __dev_remove_offload netdev_has_any_upper_dev __netdev_adjacent_dev_remove __netdev_adjacent_dev_link_lists __netdev_adjacent_dev_unlink_lists __netdev_adjacent_dev_unlink __netdev_adjacent_dev_link_neighbour __netdev_adjacent_dev_unlink_neighbour And the following are never used and should be deleted netdev_lower_dev_get_private_rcu __netdev_find_adj_rcu Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 88afa8048a7c..bec60c481966 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1741,8 +1741,6 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) return info->dev; } -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, - struct netdev_notifier_info *info); int call_netdevice_notifiers(unsigned long val, struct net_device *dev); @@ -1809,7 +1807,6 @@ void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); -void __dev_remove_offload(struct packet_offload *po); struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, unsigned short mask); @@ -2867,7 +2864,6 @@ extern int weight_p; extern int bpf_jit_enable; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); -bool netdev_has_any_upper_dev(struct net_device *dev); struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); @@ -2907,8 +2903,6 @@ int netdev_master_upper_dev_link_private(struct net_device *dev, void *private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); -void *netdev_lower_dev_get_private_rcu(struct net_device *dev, - struct net_device *lower_dev); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); int skb_checksum_help(struct sk_buff *skb); -- cgit v1.2.3 From 86f8515f9721fa171483f0fe0391968fbb949cc9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 29 Dec 2013 17:27:11 +0100 Subject: net: netprio: rename config to be more consistent with cgroup configs While we're at it and introduced CGROUP_NET_CLASSID, lets also make NETPRIO_CGROUP more consistent with the rest of cgroups and rename it into CONFIG_CGROUP_NET_PRIO so that for networking, we now have CONFIG_CGROUP_NET_{PRIO,CLASSID}. This not only makes the CONFIG option consistent among networking cgroups, but also among cgroups CONFIG conventions in general as the vast majority has a prefix of CONFIG_CGROUP_. Signed-off-by: Daniel Borkmann Cc: Zefan Li Cc: cgroups@vger.kernel.org Acked-by: Li Zefan Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5260d2eae2e6..45cf68194aa8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1444,7 +1444,7 @@ struct net_device { /* max exchange id for FCoE LRO by ddp */ unsigned int fcoe_ddp_xid; #endif -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif /* phy device may attach itself for hardware timestamping */ -- cgit v1.2.3 From 8f84985fec10de64a6b4cdfea45f2b0ab8f07c78 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 4 Jan 2014 13:57:59 +0800 Subject: net: unify the pcpu_tstats and br_cpu_netstats as one They are same, so unify them as one, pcpu_sw_netstats. Define pcpu_sw_netstat in netdevice.h, remove pcpu_tstats from if_tunnel and remove br_cpu_netstats from br_private.h Cc: Cong Wang Cc: Stephen Hemminger Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bec60c481966..51c0fe258163 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1409,7 +1409,7 @@ struct net_device { union { void *ml_priv; struct pcpu_lstats __percpu *lstats; /* loopback stats */ - struct pcpu_tstats __percpu *tstats; /* tunnel stats */ + struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; /* dummy stats */ struct pcpu_vstats __percpu *vstats; /* veth stats */ }; @@ -1685,6 +1685,15 @@ struct packet_offload { struct list_head list; }; +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_sw_netstats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From bf5a755f5e9186406bbf50f4087100af5bd68e40 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Tue, 7 Jan 2014 10:23:19 -0800 Subject: net-gre-gro: Add GRE support to the GRO stack This patch built on top of Commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 ("net-gro: Prepare GRO stack for the upcoming tunneling support") to add the support of the standard GRE (RFC1701/RFC2784/RFC2890) to the GRO stack. It also serves as an example for supporting other encapsulation protocols in the GRO stack in the future. The patch supports version 0 and all the flags (key, csum, seq#) but will flush any pkt with the S (seq#) flag. This is because the S flag is not support by GSO, and a GRO pkt may end up in the forwarding path, thus requiring GSO support to break it up correctly. Currently the "packet_offload" structure only contains L3 (ETH_P_IP/ ETH_P_IPV6) GRO offload support so the encapped pkts are limited to IP pkts (i.e., w/o L2 hdr). But support for other protocol type can be easily added, so is the support for GRE variations like NVGRE. The patch also support csum offload. Specifically if the csum flag is on and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE), the code will take advantage of the csum computed by the h/w when validating the GRE csum. Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre: add GRO capability" already introduces GRO capability to IPv4 GRE tunnels, using the gro_cells infrastructure. But GRO is done after GRE hdr has been removed (i.e., decapped). The following patch applies GRO when pkts first come in (before hitting the GRE tunnel code). There is some performance advantage for applying GRO as early as possible. Also this approach is transparent to other subsystem like Open vSwitch where GRE decap is handled outside of the IP stack hence making it harder for the gro_cells stuff to apply. On the other hand, some NICs are still not capable of hashing on the inner hdr of a GRE pkt (RSS). In that case the GRO processing of pkts from the same remote host will all happen on the same CPU and the performance may be suboptimal. I'm including some rough preliminary performance numbers below. Note that the performance will be highly dependent on traffic load, mix as usual. Moreover it also depends on NIC offload features hence the following is by no means a comprehesive study. Local testing and tuning will be needed to decide the best setting. All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs. (super_netperf 50 -H 192.168.1.18 -l 30) An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1 mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123) is configured. The GRO support for pkts AFTER decap are controlled through the device feature of the GRE device (e.g., ethtool -K gre1 gro on/off). 1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off thruput: 9.16Gbps CPU utilization: 19% 1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off thruput: 5.9Gbps CPU utilization: 15% 1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 12-13% 1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 10% The following tests were performed on a different NIC that is capable of csum offload. I.e., the h/w is capable of computing IP payload csum (CHECKSUM_COMPLETE). 2.1 ethtool -K gre1 gro on (hence will use gro_cells) 2.1.1 ethtool -K eth0 gro off; csum offload disabled thruput: 8.53Gbps CPU utilization: 9% 2.1.2 ethtool -K eth0 gro off; csum offload enabled thruput: 8.97Gbps CPU utilization: 7-8% 2.1.3 ethtool -K eth0 gro on; csum offload disabled thruput: 8.83Gbps CPU utilization: 5-6% 2.1.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.98Gbps CPU utilization: 5% 2.2 ethtool -K gre1 gro off 2.2.1 ethtool -K eth0 gro off; csum offload disabled thruput: 5.93Gbps CPU utilization: 9% 2.2.2 ethtool -K eth0 gro off; csum offload enabled thruput: 5.62Gbps CPU utilization: 8% 2.2.3 ethtool -K eth0 gro on; csum offload disabled thruput: 7.69Gbps CPU utilization: 8% 2.2.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.96Gbps CPU utilization: 5-6% Signed-off-by: H.K. Jerry Chu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d9c961aa6a7f..a2a70cc70e7b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1632,7 +1632,10 @@ struct napi_gro_cb { int data_offset; /* This is non-zero if the packet cannot be merged with the new skb. */ - int flush; + u16 flush; + + /* Save the IP ID here and check when we get to the transport layer */ + u16 flush_id; /* Number of segments aggregated. */ u16 count; @@ -1651,6 +1654,9 @@ struct napi_gro_cb { /* Used in ipv6_gro_receive() */ int proto; + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ + __wsum csum; + /* used in skb_gro_receive() slow path */ struct sk_buff *last; }; @@ -1900,6 +1906,14 @@ static inline void *skb_gro_network_header(struct sk_buff *skb) skb_network_offset(skb); } +static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, + const void *start, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, + csum_partial(start, len, 0)); +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, @@ -2440,6 +2454,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); +struct packet_offload *gro_find_receive_by_type(__be16 type); +struct packet_offload *gro_find_complete_by_type(__be16 type); static inline void napi_free_frags(struct napi_struct *napi) { -- cgit v1.2.3 From 5bb025fae53889cc99a21058c5dd369bf8cce820 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Tue, 14 Jan 2014 21:58:51 +0100 Subject: net: rename sysfs symlinks on device name change Currently, we don't rename the upper/lower_ifc symlinks in /sys/class/net/*/ , which might result stale/duplicate links/names. Fix this by adding netdev_adjacent_rename_links(dev, oldname) which renames all the upper/lower interface's links to dev from the upper/lower_oldname to the new name. We don't need a rollback because only we control these symlinks and if we fail to rename them - sysfs will anyway complain. Reported-by: Ding Tianhong CC: Ding Tianhong CC: "David S. Miller" CC: Eric Dumazet CC: Nicolas Dichtel CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c88ab19b3eb..30f6513f3b4d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2941,6 +2941,7 @@ int netdev_master_upper_dev_link_private(struct net_device *dev, void *private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); int skb_checksum_help(struct sk_buff *skb); -- cgit v1.2.3 From 1d486bfb66971ebacc2a46a23431ace9af70dc66 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Thu, 16 Jan 2014 00:02:18 +0100 Subject: net: add NETDEV_PRECHANGEMTU to notify before mtu change happens Currently, if a device changes its mtu, first the change happens (invloving all the side effects), and after that the NETDEV_CHANGEMTU is sent so that other devices can catch up with the new mtu. However, if they return NOTIFY_BAD, then the change is reverted and error returned. This is a really long and costy operation (sometimes). To fix this, add NETDEV_PRECHANGEMTU notification which is called prior to any change actually happening, and if any callee returns NOTIFY_BAD - the change is aborted. This way we're skipping all the playing with apply/revert the mtu. CC: "David S. Miller" CC: Jiri Pirko CC: Eric Dumazet CC: Nicolas Dichtel CC: Cong Wang Signed-off-by: Veaceslav Falico Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 30f6513f3b4d..d7668b881d08 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1718,7 +1718,7 @@ struct pcpu_sw_netstats { #define NETDEV_CHANGE 0x0004 /* Notify device state change */ #define NETDEV_REGISTER 0x0005 #define NETDEV_UNREGISTER 0x0006 -#define NETDEV_CHANGEMTU 0x0007 +#define NETDEV_CHANGEMTU 0x0007 /* notify after mtu change happened */ #define NETDEV_CHANGEADDR 0x0008 #define NETDEV_GOING_DOWN 0x0009 #define NETDEV_CHANGENAME 0x000A @@ -1734,6 +1734,7 @@ struct pcpu_sw_netstats { #define NETDEV_JOIN 0x0014 #define NETDEV_CHANGEUPPER 0x0015 #define NETDEV_RESEND_IGMP 0x0016 +#define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); -- cgit v1.2.3 From a953be53ce40440acb4740edb48577b9468d4c3d Mon Sep 17 00:00:00 2001 From: Michael Dalton Date: Thu, 16 Jan 2014 22:23:28 -0800 Subject: net-sysfs: add support for device-specific rx queue sysfs attributes Extend existing support for netdevice receive queue sysfs attributes to permit a device-specific attribute group. Initial use case for this support will be to allow the virtio-net device to export per-receive queue mergeable receive buffer size. Signed-off-by: Michael Dalton Signed-off-by: David S. Miller --- include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d7668b881d08..e985231fe04b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -668,15 +668,28 @@ extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif +#endif /* CONFIG_RPS */ /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { +#ifdef CONFIG_RPS struct rps_map __rcu *rps_map; struct rps_dev_flow_table __rcu *rps_flow_table; +#endif struct kobject kobj; struct net_device *dev; } ____cacheline_aligned_in_smp; -#endif /* CONFIG_RPS */ + +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; #ifdef CONFIG_XPS /* @@ -1313,7 +1326,7 @@ struct net_device { unicast) */ -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS struct netdev_rx_queue *_rx; /* Number of RX queues allocated at register_netdev() time */ @@ -1424,6 +1437,8 @@ struct net_device { struct device dev; /* space for optional device, statistics, and wireless sysfs groups */ const struct attribute_group *sysfs_groups[4]; + /* space for optional per-rx queue attributes */ + const struct attribute_group *sysfs_rx_queue_group; /* rtnetlink link ops */ const struct rtnl_link_ops *rtnl_link_ops; @@ -2375,7 +2390,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev) int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); #else static inline int netif_set_real_num_rx_queues(struct net_device *dev, @@ -2394,7 +2409,7 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, from_dev->real_num_tx_queues); if (err) return err; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS return netif_set_real_num_rx_queues(to_dev, from_dev->real_num_rx_queues); #else @@ -2402,6 +2417,18 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, #endif } +#ifdef CONFIG_SYSFS +static inline unsigned int get_netdev_rx_queue_index( + struct netdev_rx_queue *queue) +{ + struct net_device *dev = queue->dev; + int index = queue - dev->_rx; + + BUG_ON(index >= dev->num_rx_queues); + return index; +} +#endif + #define DEFAULT_MAX_NUM_RSS_QUEUES (8) int netif_get_num_default_rss_queues(void); -- cgit v1.2.3 From 1d3ee88ae0d605629bf369ab0b868dae8ca62a48 Mon Sep 17 00:00:00 2001 From: "sfeldma@cumulusnetworks.com" Date: Thu, 16 Jan 2014 22:57:56 -0800 Subject: bonding: add netlink attributes to slave link dev If link is IFF_SLAVE, extend link dev netlink attributes to include slave attributes with new IFLA_SLAVE nest. Add netlink notification (RTM_NEWLINK) when slave status changes from backup to active, or visa-versa. Adds new ndo_get_slave op to net_device_ops to fill skb with IFLA_SLAVE attributes. Currently only used by bonding driver, but could be used by other aggregating devices with slaves. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e985231fe04b..83ce2aee65e6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -921,6 +921,9 @@ struct netdev_phys_port_id { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * + * int (*ndo_get_slave)(struct net_device *slave_dev, struct sk_buff *skb); + * Called to fill netlink skb with slave info. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1093,6 +1096,8 @@ struct net_device_ops { struct net_device *slave_dev); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + int (*ndo_get_slave)(struct net_device *slave_dev, + struct sk_buff *skb); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, -- cgit v1.2.3 From b582ef0990d457f7ce8ccf827af51a575ca0b4a6 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 20 Jan 2014 13:59:19 +0200 Subject: net: Add GRO support for UDP encapsulating protocols Add GRO handlers for protocols that do UDP encapsulation, with the intent of being able to coalesce packets which encapsulate packets belonging to the same TCP session. For GRO purposes, the destination UDP port takes the role of the ether type field in the ethernet header or the next protocol in the IP header. The UDP GRO handler will only attempt to coalesce packets whose destination port is registered to have gro handler. Use a mark on the skb GRO CB data to disallow (flush) running the udp gro receive code twice on a packet. This solves the problem of udp encapsulated packets whose inner VM packet is udp and happen to carry a port which has registered offloads. Signed-off-by: Shlomo Pongratz Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 83ce2aee65e6..c31022980e18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1675,7 +1675,10 @@ struct napi_gro_cb { unsigned long age; /* Used in ipv6_gro_receive() */ - int proto; + u16 proto; + + /* Used in udp_gro_receive */ + u16 udp_mark; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -1714,6 +1717,11 @@ struct packet_offload { struct list_head list; }; +struct udp_offload { + __be16 port; + struct offload_callbacks callbacks; +}; + /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; -- cgit v1.2.3 From a9517d0f43832d787a4a0348163d659641bfd83c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 22 Jan 2014 09:05:57 +0100 Subject: rtnetlink: remove ndo_get_slave No longer used API bond-specific can be removed now. This is now handled in a generic way in rtnl_link_ops. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c31022980e18..440a02ee6f92 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -921,9 +921,6 @@ struct netdev_phys_port_id { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * - * int (*ndo_get_slave)(struct net_device *slave_dev, struct sk_buff *skb); - * Called to fill netlink skb with slave info. - * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1096,8 +1093,6 @@ struct net_device_ops { struct net_device *slave_dev); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); - int (*ndo_get_slave)(struct net_device *slave_dev, - struct sk_buff *skb); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, -- cgit v1.2.3