From 04600794958f1833f5571c6cde40f260ab557f55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2010 17:45:15 +0200 Subject: cfg80211: support sysfs namespaces Enable using network namespaces with wireless devices even when sysfs is enabled using the same infrastructure that was built for netdevs. Signed-off-by: Johannes Berg Acked-by: "Eric W. Biederman" Signed-off-by: John W. Linville --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 46c36ffe20ee..a4b14fd81c6a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2171,6 +2171,8 @@ extern void dev_seq_stop(struct seq_file *seq, void *v); extern int netdev_class_create_file(struct class_attribute *class_attr); extern void netdev_class_remove_file(struct class_attribute *class_attr); +extern struct kobj_ns_type_operations net_ns_type_operations; + extern char *netdev_drivername(const struct net_device *dev, char *buffer, int len); extern void linkwatch_run_queue(void); -- cgit v1.2.3 From 21dc330157454046dd7c494961277d76e1c957fe Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Aug 2010 00:13:46 -0700 Subject: net: Rename skb_has_frags to skb_has_frag_list SKBs can be "fragmented" in two ways, via a page array (called skb_shinfo(skb)->frags[]) and via a list of SKBs (called skb_shinfo(skb)->frag_list). Since skb_has_frags() tests the latter, it's name is confusing since it sounds more like it's testing the former. Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 46c36ffe20ee..ce2de8b64083 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2191,7 +2191,7 @@ static inline int net_gso_ok(int features, int gso_type) static inline int skb_gso_ok(struct sk_buff *skb, int features) { return net_gso_ok(features, skb_shinfo(skb)->gso_type) && - (!skb_has_frags(skb) || (features & NETIF_F_FRAGLIST)); + (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) -- cgit v1.2.3 From 1726442e115a9e58f40747d009a5b4f303e0840a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 23 Aug 2010 16:26:41 +0000 Subject: net: increase the size of priv_flags and add IFF_OVS_DATAPATH IFF_OVS_DATAPATH is a place-holder for the Open vSwitch datapath which I am preparing to submit for merging. As all 16 bits of priv_flags are already assigned flags, also increase the size of priv_flags to 32 bits. Unfortunately, by my calculations this increases the size of struct net_device by 4 bytes on 32bit architectures and 8 bytes on 64 bit architectures. I couldn't see an obvious way to avoid that. Cc: Jesse Gross Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce2de8b64083..59962dbc2758 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -901,7 +901,7 @@ struct net_device { unsigned int flags; /* interface flags (a la BSD) */ unsigned short gflags; - unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */ + unsigned int priv_flags; /* Like 'flags' but invisible to userspace. */ unsigned short padded; /* How much padding added by alloc_netdev() */ unsigned char operstate; /* RFC2863 operstate */ -- cgit v1.2.3 From 4dc89133f49b8cfd77ba7e83f5960aed63aaa99e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Aug 2010 07:40:16 +0000 Subject: net: add a comment on netdev->last_rx As some driver authors seem to reintroduce dev->last_rx use, add a comment to strongly discourage this. Since commit 6cf3f41e6c0 (bonding, net: Move last_rx update into bonding recv logic), network drivers dont need to update last_rx themselves, unless they use this field to implement a timeout. Not updating last_rx helps not dirtying a cache line, improving performance in SMP. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0cf9448a32c4..c82220a9f3d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -953,7 +953,14 @@ struct net_device { /* * Cache line mostly used on receive path (including eth_type_trans()) */ - unsigned long last_rx; /* Time of last Rx */ + unsigned long last_rx; /* Time of last Rx + * This should not be set in + * drivers, unless really needed, + * because network stack (bonding) + * use it if/when necessary, to + * avoid dirtying this cache line. + */ + /* Interface address info used in eth_type_trans() */ unsigned char *dev_addr; /* hw address, (before bcast because most packets are -- cgit v1.2.3 From 86cac58b71227cc34a3d0e78f19585c0eff49ea3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Aug 2010 18:25:32 +0000 Subject: skge: add GRO support - napi_gro_flush() is exported from net/core/dev.c, to avoid an irq_save/irq_restore in the packet receive path. - use napi_gro_receive() instead of netif_receive_skb() - use napi_gro_flush() before calling __napi_complete() - turn on NETIF_F_GRO by default - Tested on a Marvell 88E8001 Gigabit NIC Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c82220a9f3d5..af05186d5b36 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1702,6 +1702,7 @@ extern gro_result_t dev_gro_receive(struct napi_struct *napi, extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); extern gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); +extern void napi_gro_flush(struct napi_struct *napi); extern void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb); extern struct sk_buff * napi_get_frags(struct napi_struct *napi); -- cgit v1.2.3 From 95ae6b228f814fc0528d0506ee9f18ac333d6851 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 04:04:31 +0000 Subject: ipv4: ip_ptr cleanups dev->ip_ptr is protected by rtnl and rcu. Yet some places dont use appropriate primitives and/or locking rules. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index af05186d5b36..8992fffb8104 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -942,7 +942,7 @@ struct net_device { void *dsa_ptr; /* dsa specific data */ #endif void *atalk_ptr; /* AppleTalk link */ - void *ip_ptr; /* IPv4 specific data */ + struct in_device __rcu *ip_ptr; /* IPv4 specific data */ void *dn_ptr; /* DECnet specific data */ void *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ -- cgit v1.2.3 From cd13539b8bc9ae884e6d8d9374c594adff4304e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Sep 2010 02:58:13 +0000 Subject: net: shrinks struct net_device commit ab95bfe01 (net: replace hooks in __netif_receive_skb) added rx_handler at wrong place, between two cache line aligned objects, creating a big hole (a full cache line) Move rx_handler and rx_handler_data before rx_queue, filling existing hole. Move master field in the cache line(s) used in receive path. This saves 64 bytes (or L1_CACHE_BYTES), and avoids two possible cache misses in receive path. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8992fffb8104..ec17887a5bca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -918,10 +918,6 @@ struct net_device { unsigned short needed_headroom; unsigned short needed_tailroom; - struct net_device *master; /* Pointer to master device of a group, - * which this device is member of. - */ - /* Interface address info. */ unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ unsigned char addr_assign_type; /* hw address assignment type */ @@ -951,7 +947,7 @@ struct net_device { assign before registering */ /* - * Cache line mostly used on receive path (including eth_type_trans()) + * Cache lines mostly used on receive path (including eth_type_trans()) */ unsigned long last_rx; /* Time of last Rx * This should not be set in @@ -961,6 +957,10 @@ struct net_device { * avoid dirtying this cache line. */ + struct net_device *master; /* Pointer to master device of a group, + * which this device is member of. + */ + /* Interface address info used in eth_type_trans() */ unsigned char *dev_addr; /* hw address, (before bcast because most packets are @@ -980,10 +980,14 @@ struct net_device { unsigned int num_rx_queues; #endif - struct netdev_queue rx_queue; rx_handler_func_t *rx_handler; void *rx_handler_data; + struct netdev_queue rx_queue; /* use two cache lines */ + +/* + * Cache lines mostly used on transmit path + */ struct netdev_queue *_tx ____cacheline_aligned_in_smp; /* Number of TX queues allocated at alloc_netdev_mq() time */ @@ -997,9 +1001,7 @@ struct net_device { unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; -/* - * One part is mostly used on xmit path (device) - */ + /* These may be needed for future network-power-down code. */ /* -- cgit v1.2.3 From 8f8f103d8466e627ecef7894248eb79407d9047c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 19 Sep 2010 11:24:02 -0700 Subject: net: reorder struct netdev_hw_addr Move 'synced' and 'global_use' fields before 'refcount', to shrinks struct netdev_hw_addr by 8 bytes (on 64bit arches). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec17887a5bca..f7f1302138af 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -228,9 +228,9 @@ struct netdev_hw_addr { #define NETDEV_HW_ADDR_T_SLAVE 3 #define NETDEV_HW_ADDR_T_UNICAST 4 #define NETDEV_HW_ADDR_T_MULTICAST 5 - int refcount; bool synced; bool global_use; + int refcount; struct rcu_head rcu_head; }; -- cgit v1.2.3 From a02cec2155fbea457eca8881870fd2de1a4c4c76 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 20:43:57 +0000 Subject: net: return operator cleanup Change "return (EXPR);" to "return EXPR;" return is not a function, parentheses are not required. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f7f1302138af..45dcda5bfda9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1676,7 +1676,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) */ static inline int netif_is_multiqueue(const struct net_device *dev) { - return (dev->num_tx_queues > 1); + return dev->num_tx_queues > 1; } extern void netif_set_real_num_tx_queues(struct net_device *dev, -- cgit v1.2.3 From a7855c78a24d6348e989bec616318e68c662e78b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 23:51:51 +0000 Subject: net: loopback driver cleanup loopback driver uses dev->ml_priv to store its percpu stats pointer. It uses ugly casts "(void __percpu __force *)" to shut up sparse complains. Define an union to better document we use ml_priv in loopback driver and define a lstats field with appropriate types. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 45dcda5bfda9..01bd4c82d982 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1050,8 +1050,10 @@ struct net_device { #endif /* mid-layer private */ - void *ml_priv; - + union { + void *ml_priv; + struct pcpu_lstats __percpu *lstats; /* loopback stats */ + }; /* GARP */ struct garp_port *garp_port; -- cgit v1.2.3 From 290b895e0ba4552dfcfc4bd35759c192345b934a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 00:33:35 +0000 Subject: tunnels: prepare percpu accounting Tunnels are going to use percpu for their accounting. They are going to use a new tstats field in net_device. skb_tunnel_rx() is changed to be a wrapper around __skb_tunnel_rx() IPTUNNEL_XMIT() is changed to be a wrapper around __IPTUNNEL_XMIT() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 01bd4c82d982..83de0eb7a071 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1053,6 +1053,7 @@ struct net_device { union { void *ml_priv; struct pcpu_lstats __percpu *lstats; /* loopback stats */ + struct pcpu_tstats __percpu *tstats; /* tunnel stats */ }; /* GARP */ struct garp_port *garp_port; -- cgit v1.2.3 From 62fe0b40abb3484413800edaef9b087a20059acf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 27 Sep 2010 08:24:33 +0000 Subject: net: Allow changing number of RX queues after device allocation For RPS, we create a kobject for each RX queue based on the number of queues passed to alloc_netdev_mq(). However, drivers generally do not determine the numbers of hardware queues to use until much later, so this usually represents the maximum number the driver may use and not the actual number in use. For TX queues, drivers can update the actual number using netif_set_real_num_tx_queues(). Add a corresponding function for RX queues, netif_set_real_num_rx_queues(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 83de0eb7a071..b15732e22eee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -976,8 +976,11 @@ struct net_device { struct netdev_rx_queue *_rx; - /* Number of RX queues allocated at alloc_netdev_mq() time */ + /* Number of RX queues allocated at register_netdev() time */ unsigned int num_rx_queues; + + /* Number of RX queues currently active in device */ + unsigned int real_num_rx_queues; #endif rx_handler_func_t *rx_handler; @@ -1685,6 +1688,17 @@ static inline int netif_is_multiqueue(const struct net_device *dev) extern void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); +#ifdef CONFIG_RPS +extern int netif_set_real_num_rx_queues(struct net_device *dev, + unsigned int rxq); +#else +static inline int netif_set_real_num_rx_queues(struct net_device *dev, + unsigned int rxq) +{ + return 0; +} +#endif + /* Use this variant when it is known for sure that it * is executing from hardware interrupt context or with hardware interrupts * disabled. -- cgit v1.2.3 From 3171d026291d08c2a4cfe06302ce308b09605c4b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 27 Sep 2010 08:24:49 +0000 Subject: net: Add netif_copy_real_num_queues() for use by virtual net drivers This sets the active numbers of queues on a net device to match another. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b15732e22eee..c2bec990bd17 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1699,6 +1699,18 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev, } #endif +static inline int netif_copy_real_num_queues(struct net_device *to_dev, + const struct net_device *from_dev) +{ + netif_set_real_num_tx_queues(to_dev, from_dev->real_num_tx_queues); +#ifdef CONFIG_RPS + return netif_set_real_num_rx_queues(to_dev, + from_dev->real_num_rx_queues); +#else + return 0; +#endif +} + /* Use this variant when it is known for sure that it * is executing from hardware interrupt context or with hardware interrupts * disabled. -- cgit v1.2.3 From 6d81f41c58c69ddde497e9e640ba5805aa26e78c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 20:50:33 +0000 Subject: dummy: percpu stats and lockless xmit Converts dummy network device driver to : - percpu stats - 64bit stats - lockless xmit (NETIF_F_LLTX) - performance features added (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO | NETIF_F_NO_CSUM | NETIF_F_HIGHDMA) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c2bec990bd17..6f0845e0b888 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1057,6 +1057,7 @@ struct net_device { void *ml_priv; struct pcpu_lstats __percpu *lstats; /* loopback stats */ struct pcpu_tstats __percpu *tstats; /* tunnel stats */ + struct pcpu_dstats __percpu *dstats; /* dummy stats */ }; /* GARP */ struct garp_port *garp_port; -- cgit v1.2.3 From bfa5ae63b823f4ffd3483a05f60a93a4a7b7d680 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 05:58:37 +0000 Subject: net: rename netdev rx_queue to ingress_queue There is some confusion with rx_queue name after RPS, and net drivers private rx_queue fields. I suggest to rename "struct net_device"->rx_queue to ingress_queue. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f0845e0b888..ceed3474014a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -986,7 +986,7 @@ struct net_device { rx_handler_func_t *rx_handler; void *rx_handler_data; - struct netdev_queue rx_queue; /* use two cache lines */ + struct netdev_queue ingress_queue; /* use two cache lines */ /* * Cache lines mostly used on transmit path -- cgit v1.2.3 From 24824a09e35402b8d58dcc5be803a5ad3937bdba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 Oct 2010 06:11:55 +0000 Subject: net: dynamic ingress_queue allocation ingress being not used very much, and net_device->ingress_queue being quite a big object (128 or 256 bytes), use a dynamic allocation if needed (tc qdisc add dev eth0 ingress ...) dev_ingress_queue(dev) helper should be used only with RTNL taken. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ceed3474014a..92d81edd5808 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -986,7 +986,7 @@ struct net_device { rx_handler_func_t *rx_handler; void *rx_handler_data; - struct netdev_queue ingress_queue; /* use two cache lines */ + struct netdev_queue __rcu *ingress_queue; /* * Cache lines mostly used on transmit path -- cgit v1.2.3 From caf586e5f23cebb2a68cbaf288d59dbbf2d74052 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 21:06:55 +0000 Subject: net: add a core netdev->rx_dropped counter In various situations, a device provides a packet to our stack and we drop it before it enters protocol stack : - softnet backlog full (accounted in /proc/net/softnet_stat) - bad vlan tag (not accounted) - unknown/unregistered protocol (not accounted) We can handle a per-device counter of such dropped frames at core level, and automatically adds it to the device provided stats (rx_dropped), so that standard tools can be used (ifconfig, ip link, cat /proc/net/dev) This is a generalization of commit 8990f468a (net: rx_dropped accounting), thus reverting it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 92d81edd5808..6abcef67b178 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -884,6 +884,9 @@ struct net_device { int iflink; struct net_device_stats stats; + atomic_long_t rx_dropped; /* dropped packets by core network + * Do not use this in drivers. + */ #ifdef CONFIG_WIRELESS_EXT /* List of functions to handle Wireless Extensions (instead of ioctl). -- cgit v1.2.3 From 34d101dd6204bd100fc2e6f7b5f9a10f959ce2c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 09:16:57 -0700 Subject: neigh: speedup neigh_hh_init() When a new dst is used to send a frame, neigh_resolve_output() tries to associate an struct hh_cache to this dst, calling neigh_hh_init() with the neigh rwlock write locked. Most of the time, hh_cache is already known and linked into neighbour, so we find it and increment its refcount. This patch changes the logic so that we call neigh_hh_init() with neighbour lock read locked only, so that fast path can be run in parallel by concurrent cpus. This brings part of the speedup we got with commit c7d4426a98a5f (introduce DST_NOCACHE flag) for non cached dsts, even for cached ones, removing one of the contention point that routers hit on multiqueue enabled machines. Further improvements would need to use a seqlock instead of an rwlock to protect neigh->ha[], to not dirty neigh too often and remove two atomic ops. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6abcef67b178..4160db3721ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -281,6 +281,12 @@ struct hh_cache { unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; }; +static inline void hh_cache_put(struct hh_cache *hh) +{ + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); +} + /* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much. * Alternative is: * dev->hard_header_len ? (dev->hard_header_len + -- cgit v1.2.3 From 29b4433d991c88d86ca48a4c1cc33c671475be4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 10:22:12 +0000 Subject: net: percpu net_device refcount We tried very hard to remove all possible dev_hold()/dev_put() pairs in network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet) We can switch to a percpu refcount implementation, now dynamic per_cpu infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code : before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4160db3721ba..14fbb04c459d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1026,7 +1026,7 @@ struct net_device { struct timer_list watchdog_timer; /* Number of references to this device */ - atomic_t refcnt ____cacheline_aligned_in_smp; + int __percpu *pcpu_refcnt; /* delayed register/unregister */ struct list_head todo_list; @@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev) unregister_netdevice_queue(dev, NULL); } +extern int netdev_refcnt_read(const struct net_device *dev); extern void free_netdev(struct net_device *dev); extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); @@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { - atomic_dec(&dev->refcnt); + irqsafe_cpu_dec(*dev->pcpu_refcnt); } /** @@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { - atomic_inc(&dev->refcnt); + irqsafe_cpu_inc(*dev->pcpu_refcnt); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- cgit v1.2.3 From e6484930d7c73d324bccda7d43d131088da697b9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:04:39 +0000 Subject: net: allocate tx queues in register_netdevice This patch introduces netif_alloc_netdev_queues which is called from register_device instead of alloc_netdev_mq. This makes TX queue allocation symmetric with RX allocation. Also, queue locks allocation is done in netdev_init_one_queue. Change set_real_num_tx_queues to fail if requested number < 1 or greater than number of allocated queues. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14fbb04c459d..880d56565828 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1696,8 +1696,8 @@ static inline int netif_is_multiqueue(const struct net_device *dev) return dev->num_tx_queues > 1; } -extern void netif_set_real_num_tx_queues(struct net_device *dev, - unsigned int txq); +extern int netif_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq); #ifdef CONFIG_RPS extern int netif_set_real_num_rx_queues(struct net_device *dev, -- cgit v1.2.3 From 7b9c60903714bf0a19d746b228864bad3497284e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:04 +0000 Subject: vlan: Enable software emulation for vlan accleration. Currently users of hardware vlan accleration need to know whether the device supports it before generating packets. However, vlan acceleration will soon be available in a more flexible manner so knowing ahead of time becomes much more difficult. This adds a software fallback path for vlan packets on devices without the necessary offloading support, similar to other types of hardware accleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 880d56565828..2861565a27d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2248,9 +2248,17 @@ static inline int skb_gso_ok(struct sk_buff *skb, int features) static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { - return skb_is_gso(skb) && - (!skb_gso_ok(skb, dev->features) || - unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); + if (skb_is_gso(skb)) { + int features = dev->features; + + if (skb->protocol == htons(ETH_P_8021Q) || skb->vlan_tci) + features &= dev->vlan_features; + + return (!skb_gso_ok(skb, features) || + unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); + } + + return 0; } static inline void netif_set_gso_max_size(struct net_device *dev, -- cgit v1.2.3 From 65ac6a5fa658b90f1be700c55e7cd72e4611015d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:05 +0000 Subject: vlan: Avoid hash table lookup to find group. A struct net_device always maps to zero or one vlan groups and we always know the device when we are looking up a group. We currently do a hash table lookup on the device to find the group but it is much simpler to just store a pointer. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2861565a27d9..9c78312ce142 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -942,7 +942,10 @@ struct net_device { /* Protocol specific pointers */ - + +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + struct vlan_group *vlgrp; /* VLAN group */ +#endif #ifdef CONFIG_NET_DSA void *dsa_ptr; /* dsa specific data */ #endif -- cgit v1.2.3 From 3701e51382a026cba10c60b03efabe534fba4ca4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:06 +0000 Subject: vlan: Centralize handling of hardware acceleration. Currently each driver that is capable of vlan hardware acceleration must be aware of the vlan groups that are configured and then pass the stripped tag to a specialized receive function. This is different from other types of hardware offload in that it places a significant amount of knowledge in the driver itself rather keeping it in the networking core. This makes vlan offloading function more similarly to other forms of offloading (such as checksum offloading or TSO) by doing the following: * On receive, stripped vlans are passed directly to the network core, without attempting to check for vlan groups or reconstructing the header if no group * vlans are made less special by folding the logic into the main receive routines * On transmit, the device layer will add the vlan header in software if the hardware doesn't support it, instead of spreading that logic out in upper layers, such as bonding. There are a number of advantages to this: * Fixes all bugs with drivers incorrectly dropping vlan headers at once. * Avoids having to disable VLAN acceleration when in promiscuous mode (good for bridging since it always puts devices in promiscuous mode). * Keeps VLAN tag separate until given to ultimate consumer, which avoids needing to do header reconstruction as in tg3 unless absolutely necessary. * Consolidates common code in core networking. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c78312ce142..ed7db7eebbf3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1768,7 +1768,6 @@ extern int netdev_rx_handler_register(struct net_device *dev, void *rx_handler_data); extern void netdev_rx_handler_unregister(struct net_device *dev); -extern void netif_nit_deliver(struct sk_buff *skb); extern int dev_valid_name(const char *name); extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *); extern int dev_ethtool(struct net *net, struct ifreq *); -- cgit v1.2.3 From d0c2b0d265a0f1f92922a99a31def9da582197ac Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 19 Oct 2010 07:12:10 +0000 Subject: napi: unexport napi_reuse_skb The function napi_reuse_skb is only used inside core. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ed7db7eebbf3..fcd3dda86322 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1748,8 +1748,6 @@ extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); extern gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern void napi_gro_flush(struct napi_struct *napi); -extern void napi_reuse_skb(struct napi_struct *napi, - struct sk_buff *skb); extern struct sk_buff * napi_get_frags(struct napi_struct *napi); extern gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, -- cgit v1.2.3 From b616b09afabf6d569aa31176aa86f05d2586de3e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Oct 2010 21:31:58 +0000 Subject: vlan: rcu annotations (struct net_device)->vlgrp is rcu protected : add __rcu annotation and proper rcu primitives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcd3dda86322..ceefb441c236 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -944,7 +944,7 @@ struct net_device { /* Protocol specific pointers */ #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) - struct vlan_group *vlgrp; /* VLAN group */ + struct vlan_group __rcu *vlgrp; /* VLAN group */ #endif #ifdef CONFIG_NET_DSA void *dsa_ptr; /* dsa specific data */ -- cgit v1.2.3 From 198caeca3eb4c81bbdbeb34a870868002f89b3d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Oct 2010 21:32:05 +0000 Subject: ipv6: ip6_ptr rcu annotations (struct net_device)->ip6_ptr is rcu protected : add __rcu annotation and proper rcu primitives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ceefb441c236..4722d4a9b58f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -952,7 +952,7 @@ struct net_device { void *atalk_ptr; /* AppleTalk link */ struct in_device __rcu *ip_ptr; /* IPv4 specific data */ void *dn_ptr; /* DECnet specific data */ - void *ip6_ptr; /* IPv6 specific data */ + struct inet6_dev __rcu *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ void *ax25_ptr; /* AX.25 specific data */ struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data, -- cgit v1.2.3 From 3cc77ec74e1583b50b8405114cdbd6b8ebb8c474 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Oct 2010 21:32:36 +0000 Subject: net/802: add __rcu annotations (struct net_device)->garp_port is rcu protected : (struct garp_port)->applicants is rcu protected : add __rcu annotation and proper rcu primitives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4722d4a9b58f..b72d5a460903 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1072,7 +1072,7 @@ struct net_device { struct pcpu_dstats __percpu *dstats; /* dummy stats */ }; /* GARP */ - struct garp_port *garp_port; + struct garp_port __rcu *garp_port; /* class/net/name entry */ struct device dev; -- cgit v1.2.3 From 6e3f7faf3e8a3e226b1a6b955aac12abf2f2e1b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Oct 2010 03:02:02 +0000 Subject: rps: add __rcu annotations Add __rcu annotations to : (struct netdev_rx_queue)->rps_map (struct netdev_rx_queue)->rps_flow_table struct rps_sock_flow_table *rps_sock_flow_table; And use appropriate rcu primitives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b72d5a460903..072652d94d9f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -585,15 +585,15 @@ static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table, table->ents[hash & table->mask] = RPS_NO_CPU; } -extern struct rps_sock_flow_table *rps_sock_flow_table; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { - struct rps_map *rps_map; - struct rps_dev_flow_table *rps_flow_table; - struct kobject kobj; - struct netdev_rx_queue *first; - atomic_t count; + struct rps_map __rcu *rps_map; + struct rps_dev_flow_table __rcu *rps_flow_table; + struct kobject kobj; + struct netdev_rx_queue *first; + atomic_t count; } ____cacheline_aligned_in_smp; #endif /* CONFIG_RPS */ -- cgit v1.2.3 From 18543a643fae694982c7d89c22436885f3506497 Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sat, 6 Nov 2010 06:39:32 +0000 Subject: net: Detect and ignore netif_stop_queue() calls before register_netdev() After e6484930d7c73d324bccda7d43d131088da697b9: net: allocate tx queues in register_netdevice These calls make net drivers oops at load time, so let's avoid people git-bisect'ing known problems. Signed-off-by: Guillaume Chazarain Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 072652d94d9f..d8fd2c23a1b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1554,6 +1554,11 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { + if (WARN_ON(!dev_queue)) { + printk(KERN_INFO "netif_stop_queue() cannot be called before " + "register_netdev()"); + return; + } set_bit(__QUEUE_STATE_XOFF, &dev_queue->state); } -- cgit v1.2.3