Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile          |   1
-rw-r--r-- | net/core/datagram.c        | 301
-rw-r--r-- | net/core/dev.c             | 788
-rw-r--r-- | net/core/dev_addr_lists.c  |   4
-rw-r--r-- | net/core/dev_ioctl.c       |  16
-rw-r--r-- | net/core/drop_monitor.c    |   2
-rw-r--r-- | net/core/dst.c             |  24
-rw-r--r-- | net/core/ethtool.c         | 169
-rw-r--r-- | net/core/filter.c          | 151
-rw-r--r-- | net/core/flow_dissector.c  | 135
-rw-r--r-- | net/core/gen_estimator.c   |  29
-rw-r--r-- | net/core/gen_stats.c       | 112
-rw-r--r-- | net/core/iovec.c           |  47
-rw-r--r-- | net/core/link_watch.c      |   2
-rw-r--r-- | net/core/neighbour.c       | 279
-rw-r--r-- | net/core/net-sysfs.c       |  44
-rw-r--r-- | net/core/net_namespace.c   |  41
-rw-r--r-- | net/core/netpoll.c         |  11
-rw-r--r-- | net/core/pktgen.c          |  79
-rw-r--r-- | net/core/rtnetlink.c       | 259
-rw-r--r-- | net/core/scm.c             |   3
-rw-r--r-- | net/core/secure_seq.c      |   6
-rw-r--r-- | net/core/skbuff.c          | 712
-rw-r--r-- | net/core/sock.c            | 162
-rw-r--r-- | net/core/sysctl_net_core.c |  21
-rw-r--r-- | net/core/timestamping.c    |  43
-rw-r--r-- | net/core/tso.c             |   3
-rw-r--r-- | net/core/user_dma.c        | 131
-rw-r--r-- | net/core/utils.c           |  15
29 files changed, 2127 insertions, 1463 deletions
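Most of the churn below is the conversion of net/core's datagram copy helpers from raw struct iovec walking (memcpy_toiovec() and friends) to the iov_iter API, plus the removal of the dead NET_DMA receive offload (user_dma.o drops out of the Makefile). As a hedged sketch of what a caller looks like after the conversion — my_proto_recvmsg() is hypothetical; only skb_recv_datagram(), skb_copy_datagram_msg() and skb_free_datagram() are helpers that exist in this tree — a protocol recvmsg path reduces to:

    #include <linux/kernel.h>
    #include <linux/skbuff.h>
    #include <net/sock.h>

    /* Hypothetical protocol recvmsg: copy one queued datagram straight
     * into the caller's msghdr.  msg->msg_iter advances as data is
     * copied, so none of the old iov_base/iov_len bookkeeping remains.
     */
    static int my_proto_recvmsg(struct sock *sk, struct msghdr *msg,
                                size_t len, int flags)
    {
        struct sk_buff *skb;
        int err, copied;

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
        if (!skb)
            return err;

        copied = min_t(int, len, skb->len);
        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        skb_free_datagram(sk, skb);
        return err ? err : copied;
    }

The checksum variant follows the same shape: skb_copy_and_csum_datagram_msg() below takes a struct msghdr * and works on msg->msg_iter, so a short copy no longer leaves the caller's iovec half-modified.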
diff --git a/net/core/Makefile b/net/core/Makefile index 71093d94ad2b..235e6c50708d 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a81d4c2..df493d68330c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -49,6 +49,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/uio.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -309,16 +310,14 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) EXPORT_SYMBOL(skb_kill_datagram); /** - * skb_copy_datagram_iovec - Copy a datagram to an iovec. + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from - * @to: io vector to copy to + * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec - * - * Note: the iovec is modified during the copy. */ -int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, - struct iovec *to, int len) +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -330,8 +329,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_toiovec(to, skb->data + offset, copy)) - goto fault; + if (copy_to_iter(skb->data + offset, copy, to) != copy) + goto short_copy; if ((len -= copy) == 0) return 0; offset += copy; @@ -346,18 +345,12 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - if (copy > len) copy = len; - vaddr = kmap(page); - err = memcpy_toiovec(to, vaddr + frag->page_offset + - offset - start, copy); - kunmap(page); - if (err) - goto fault; + if (copy_page_to_iter(skb_frag_page(frag), + frag->page_offset + offset - + start, copy, to) != copy) + goto short_copy; if (!(len -= copy)) return 0; offset += copy; @@ -374,9 +367,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iovec(frag_iter, - offset - start, - to, copy)) + if (skb_copy_datagram_iter(frag_iter, offset - start, + to, copy)) goto fault; if ((len -= copy) == 0) return 0; @@ -387,113 +379,33 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if (!len) return 0; + /* This is not really a user copy fault, but rather someone + * gave us a bogus length on the skb. We should probably + * print a warning here as it may indicate a kernel bug. + */ + fault: return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_iovec); -/** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. 
- */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; +short_copy: + if (iov_iter_count(to)) + goto fault; - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; + return 0; } -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); +EXPORT_SYMBOL(skb_copy_datagram_iter); /** - * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. + * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to - * @from: io vector to copy to - * @from_offset: offset in the io vector to start copying from + * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. */ -int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - const struct iovec *from, int from_offset, +int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) { int start = skb_headlen(skb); @@ -504,13 +416,11 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovecend(skb->data + offset, from, from_offset, - copy)) + if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - from_offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ @@ -522,24 +432,19 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); + size_t copied; if (copy > len) copy = len; - vaddr = kmap(page); - err = memcpy_fromiovecend(vaddr + frag->page_offset + - offset - start, - from, from_offset, copy); - kunmap(page); - if (err) + copied = copy_page_from_iter(skb_frag_page(frag), + frag->page_offset + offset - start, + copy, from); + if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; - from_offset += copy; } start = end; } @@ -553,16 +458,13 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_from_iovec(frag_iter, - offset - start, - from, - from_offset, - copy)) + if (skb_copy_datagram_from_iter(frag_iter, + offset - start, + from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - from_offset += copy; } start = end; } @@ -572,101 +474,82 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +EXPORT_SYMBOL(skb_copy_datagram_from_iter); /** - * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy - * @from: io vector to copy from - * @offset: offset in the io vector to start copying from - * @count: amount of vectors to copy to buffer from + * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. - * Note: the iovec is not modified during the copy */ -int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { - int len = iov_length(from, count) - offset; + int len = iov_iter_count(from); int copy = min_t(int, skb_headlen(skb), len); - int size; - int i = 0; + int frag = 0; /* copy up to skb headlen */ - if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - if (len == copy) - return 0; - - offset += copy; - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; + while (iov_iter_count(from)) { + struct page *pages[MAX_SKB_FRAGS]; + size_t start; + ssize_t copied; unsigned long truesize; + int n = 0; - /* Skip over from offset and copied */ - if (offset >= from->iov_len) { - offset -= from->iov_len; - ++from; - continue; - } - len = from->iov_len - offset; - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) + if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - release_pages(&page[i], num_pages, 0); + + copied = iov_iter_get_pages(from, pages, ~0U, + MAX_SKB_FRAGS - frag, &start); + if (copied < 0) return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; + + iov_iter_advance(from, copied); + + truesize = PAGE_ALIGN(copied + start); + skb->data_len += copied; + skb->len += copied; skb->truesize += truesize; atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int 
size = min_t(int, len, PAGE_SIZE - off); - skb_fill_page_desc(skb, i, page[i], off, size); - base += size; - len -= size; - i++; + while (copied) { + int size = min_t(int, copied, PAGE_SIZE - start); + skb_fill_page_desc(skb, frag++, pages[n], start, size); + start = 0; + copied -= size; + n++; } - offset = 0; - ++from; } return 0; } -EXPORT_SYMBOL(zerocopy_sg_from_iovec); +EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, - u8 __user *to, int len, + struct iov_iter *to, int len, __wsum *csump) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; + int n; /* Copy header. */ if (copy > 0) { - int err = 0; if (copy > len) copy = len; - *csump = csum_and_copy_to_user(skb->data + offset, to, copy, - *csump, &err); - if (err) + n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); + if (n != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - to += copy; pos = copy; } @@ -678,26 +561,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - __wsum csum2; - int err = 0; - u8 *vaddr; + __wsum csum2 = 0; struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); if (copy > len) copy = len; - vaddr = kmap(page); - csum2 = csum_and_copy_to_user(vaddr + - frag->page_offset + - offset - start, - to, copy, 0, &err); + n = csum_and_copy_to_iter(vaddr + frag->page_offset + + offset - start, copy, + &csum2, to); kunmap(page); - if (err) + if (n != copy) goto fault; *csump = csum_block_add(*csump, csum2, pos); if (!(len -= copy)) return 0; offset += copy; - to += copy; pos += copy; } start = end; @@ -722,7 +601,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, if ((len -= copy) == 0) return 0; offset += copy; - to += copy; pos += copy; } start = end; @@ -775,20 +653,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) EXPORT_SYMBOL(__skb_checksum_complete); /** - * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec. + * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length - * @iov: io vector + * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. - * -EFAULT - fault during copy. Beware, in this case iovec - * can be modified! + * -EFAULT - fault during copy. */ -int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, - int hlen, struct iovec *iov) +int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, + int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; @@ -796,28 +673,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, if (!chunk) return 0; - /* Skip filled elements. 
- * Pretty silly, look at memcpy_toiovec, though 8) - */ - while (!iov->iov_len) - iov++; - - if (iov->iov_len < chunk) { + if (iov_iter_count(&msg->msg_iter) < chunk) { if (__skb_checksum_complete(skb)) goto csum_error; - if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); - if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) goto csum_error; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); - iov->iov_len -= chunk; - iov->iov_base += chunk; } return 0; csum_error: @@ -825,7 +694,7 @@ csum_error: fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll diff --git a/net/core/dev.c b/net/core/dev.c index cf8a95f48cff..683d493aa1bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -133,6 +134,7 @@ #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> +#include <linux/hrtimer.h> #include "net-sysfs.h" @@ -897,23 +899,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged. 
*/ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -921,7 +925,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device @@ -1284,7 +1288,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1363,7 +1366,6 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); netpoll_poll_enable(dev); } @@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - /* the same for macvlan devices */ - if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); + struct net_device *lower_dev; + struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); @@ -1697,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_scrub_packet(skb, true); skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); return 0; } @@ -2153,7 +2151,7 @@ static inline void __netif_reschedule(struct Qdisc *q) unsigned long flags; local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; @@ -2177,6 +2175,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) return (struct dev_kfree_skb_cb *)skb->cb; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. 
+ */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -2373,16 +2418,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } @@ -2485,61 +2520,15 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
*/ -#ifdef CONFIG_NET_MPLS_GSO +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; @@ -2574,21 +2563,33 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { + struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + + if (!vlan_tx_tag_present(skb)) { + if (unlikely(protocol == htons(ETH_P_8021Q) || + protocol == htons(ETH_P_8021AD))) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else { + goto finalize; + } } features = netdev_intersect_features(features, - skb->dev->vlan_features | + dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -2601,123 +2602,147 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); +finalize: + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) { - const struct net_device_ops *ops = dev->netdev_ops; - int rc = NETDEV_TX_OK; - unsigned int skb_len; + unsigned int len; + int rc; - if (likely(!skb->next)) { - netdev_features_t features; + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); - features = netif_skb_features(skb); + return rc; +} + +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) +{ + struct sk_buff *skb = first; + int rc = NETDEV_TX_OK; - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; + while (skb) { + struct sk_buff *next = skb->next; - skb->vlan_tci = 0; + skb->next = NULL; + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; } - /* If encapsulation offload request, verify we are 
testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; + skb = next; + if (netif_xmit_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) - goto out_kfree_skb; +out: + *ret = rc; + return skb; +} - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } - } +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) +{ + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +{ + netdev_features_t features; - skb_len = skb->len; - trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); - return rc; - } + if (skb->next) + return skb; -gso: - do { - struct sk_buff *nskb = skb->next; + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; - skb->next = nskb->next; - nskb->next = NULL; + if (netif_needs_gso(dev, skb, features)) { + struct sk_buff *segs; - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc, dev, skb_len); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { + goto out_kfree_skb; + } else if (segs) { + consume_skb(skb); + skb = segs; + } + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. 
+ */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; } - txq_trans_update(txq); - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); - -out_kfree_gso_skb: - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; } + + return skb; + out_kfree_skb: kfree_skb(skb); -out: - return rc; +out_null: + return NULL; +} + +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; + + for (; skb != NULL; skb = next) { + next = skb->next; + skb->next = NULL; + + /* in case skb wont be segmented, point to itself */ + skb->prev = skb; + + skb = validate_xmit_skb(skb, dev); + if (!skb) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; + } + return head; } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2780,12 +2805,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; @@ -2796,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2893,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. 
+ */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -2925,11 +2955,15 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; + skb = validate_xmit_skb(skb, dev); + if (!skb) + goto drop; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); @@ -2950,10 +2984,11 @@ recursion_alert: } rc = -ENETDOWN; +drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); - kfree_skb(skb); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); @@ -3130,8 +3165,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) hash * map->len) >> 32]; - + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; @@ -3201,7 +3235,7 @@ static void rps_trigger_softirq(void *data) static int rps_ipi_queued(struct softnet_data *sd) { #ifdef CONFIG_RPS - struct softnet_data *mysd = &__get_cpu_var(softnet_data); + struct softnet_data *mysd = this_cpu_ptr(&softnet_data); if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; @@ -3228,7 +3262,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) if (qlen < (netdev_max_backlog >> 1)) return false; - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); @@ -3272,7 +3306,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_lock(sd); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { + if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); @@ -3375,7 +3409,7 @@ EXPORT_SYMBOL(netif_rx_ni); static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; @@ -3467,7 +3501,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - q = rxq->qdisc; + q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) @@ -3484,7 +3518,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - if (!rxq || rxq->qdisc == &noop_qdisc) + if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) goto out; if (*pt_prev) { @@ -3800,7 +3834,7 @@ EXPORT_SYMBOL(netif_receive_skb); static void flush_backlog(void *arg) { struct net_device *dev = arg; - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); struct sk_buff *skb, *tmp; rps_lock(sd); @@ -3965,11 +3999,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto 
normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3983,6 +4016,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } + pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; } @@ -4110,6 +4159,10 @@ EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); @@ -4128,7 +4181,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; } return skb; @@ -4212,6 +4265,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. @@ -4240,20 +4318,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); while (1) { @@ -4280,7 +4366,6 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
*/ - list_del(&napi->poll_list); napi->state = 0; rps_unlock(sd); @@ -4300,30 +4385,42 @@ static int process_backlog(struct napi_struct *napi, int quota) * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); - ____napi_schedule(&__get_cpu_var(softnet_data), n); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); - list_del(&n->poll_list); + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +void napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4334,12 +4431,28 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + if (n->gro_list) { + unsigned long timeout = 0; + + if (work_done) + timeout = n->dev->gro_flush_timeout; + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + else + napi_gro_flush(n, false); + } + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ struct napi_struct *napi_by_id(unsigned int napi_id) @@ -4393,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi) } EXPORT_SYMBOL_GPL(napi_hash_del); +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + if (napi->gro_list) + napi_schedule(napi); + + return HRTIMER_NORESTART; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; @@ -4415,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, } EXPORT_SYMBOL(netif_napi_add); +void napi_disable(struct napi_struct *n) +{ + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + + while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + void netif_napi_del(struct napi_struct *napi) { list_del_init(&napi->dev_list); @@ -4426,99 +4566,112 @@ void netif_napi_del(struct napi_struct *napi) } 
EXPORT_SYMBOL(netif_napi_del); -static void net_rx_action(struct softirq_action *h) +static int napi_poll(struct napi_struct *n, struct list_head *repoll) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); - unsigned long time_limit = jiffies + 2; - int budget = netdev_budget; void *have; + int work, weight; - local_irq_disable(); + list_del_init(&n->poll_list); - while (!list_empty(&sd->poll_list)) { - struct napi_struct *n; - int work, weight; + have = netpoll_poll_lock(n); - /* If softirq window is exhuasted then punt. - * Allow this to run for 2 jiffies since which will allow - * an average latency of 1.5/HZ. - */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + weight = n->weight; - local_irq_enable(); + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + WARN_ON_ONCE(work > weight); - have = netpoll_poll_lock(n); + if (likely(work < weight)) + goto out_unlock; - weight = n->weight; + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + goto out_unlock; + } - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. */ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); - } + napi_gro_flush(n, HZ >= 1000); + } - WARN_ON_ONCE(work > weight); + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + goto out_unlock; + } - budget -= work; + list_add_tail(&n->poll_list, repoll); - local_irq_disable(); +out_unlock: + netpoll_poll_unlock(have); - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. - */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); - napi_complete(n); - local_irq_disable(); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. 
- */ - local_irq_enable(); - napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); - } - list_move_tail(&n->poll_list, &sd->poll_list); - } + return work; +} + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + 2; + int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); + + local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); + + for (;;) { + struct napi_struct *n; + + if (list_empty(&list)) { + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + return; + break; } - netpoll_poll_unlock(have); + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + + /* If softirq window is exhausted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + sd->time_squeeze++; + break; + } } -out: - net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif + local_irq_disable(); - return; + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); -softnet_break: - sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; + net_rps_action_and_irq_enable(sd); } struct netdev_adjacent { @@ -5718,7 +5871,7 @@ EXPORT_SYMBOL(dev_change_carrier); * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; @@ -5797,6 +5950,8 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -5806,6 +5961,11 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + GFP_KERNEL); + /* * Flush the unicast and multicast chains */ @@ -5815,9 +5975,8 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. 
*/ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -6589,6 +6748,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); @@ -6598,7 +6758,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; @@ -7020,53 +7180,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s%s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), netdev_reg_state(dev), - vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s%s: %pV", level, netdev_name(dev), - netdev_reg_state(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -7075,11 +7227,9 @@ int func(const struct net_device *dev, const char *fmt, ...) 
\ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); @@ -7141,11 +7291,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) */ struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { @@ -7157,9 +7306,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index b6b230600b97..c0548d268e1a 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,8 +278,8 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** - * __hw_addr_unsync_dev - Remove synchonized addresses from device - * @list: address list to remove syncronized addresses from + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index cf999e09bcd2..b94b1d293506 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -142,10 +142,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm case SIOCGIFHWADDR: if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + memset(ifr->ifr_hwaddr.sa_data, 0, + sizeof(ifr->ifr_hwaddr.sa_data)); else memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); ifr->ifr_hwaddr.sa_family = dev->type; return 0; @@ -265,7 +267,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; @@ -365,11 +368,8 @@ void dev_load(struct net *net, const char *name) no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). 
Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } + if (no_module && capable(CAP_SYS_MODULE)) + request_module("%s", name); } EXPORT_SYMBOL(dev_load); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 50f9a9db5792..252e155c837b 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -146,7 +146,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) unsigned long flags; local_irq_save(flags); - data = &__get_cpu_var(dm_cpu_data); + data = this_cpu_ptr(&dm_cpu_data); spin_lock(&data->lock); dskb = data->skb; diff --git a/net/core/dst.c b/net/core/dst.c index a028409ee438..e956ce6d1378 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); -/** - * __skb_dst_set_noref - sets skb dst, without a reference - * @skb: buffer - * @dst: dst entry - * @force: if force is set, use noref version even for DST_NOCACHE entries - * - * Sets skb dst, assuming a reference was not taken on dst - * skb_dst_drop() should not dst_release() this dst - */ -void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) -{ - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); - /* If dst not in cache, we must take a reference, because - * dst_release() will destroy dst as soon as its refcount becomes zero - */ - if (unlikely((dst->flags & DST_NOCACHE) && !force)) { - dst_hold(dst); - skb_dst_set(skb, dst); - } else { - skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; - } -} -EXPORT_SYMBOL(__skb_dst_set_noref); - /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793fa..550892cd6b3f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched.h> +#include <linux/net.h> /* * Some useful ethtool_ops methods that're device independent. 
@@ -84,7 +85,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", @@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; +static const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); + if (sset == ETH_SS_RSS_HASH_FUNCS) + return ARRAY_SIZE(rss_hash_func_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev, if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); + else if (stringset == ETH_SS_RSS_HASH_FUNCS) + memcpy(data, rss_hash_func_strings, + sizeof(rss_hash_func_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -574,6 +586,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +EXPORT_SYMBOL(netdev_rss_key_fill); + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -608,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; @@ -669,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; } - ret = ops->set_rxfh(dev, indir, NULL); + ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); out: kfree(indir); @@ -687,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, u32 total_size; u32 indir_bytes; u32 *indir = NULL; + u8 dev_hfunc = 0; u8 *hkey = NULL; u8 *rss_config; - if (!(dev->ethtool_ops->get_rxfh_indir_size || - dev->ethtool_ops->get_rxfh_key_size) || - !dev->ethtool_ops->get_rxfh) + if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -700,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; rxfh.indir_size = dev_indir_size; @@ 
-717,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; - /* If the user buffer size is 0, this is just a query for the - * device table size and key size. Otherwise, if the User size is - * not equal to device table size or key size it's an error. - */ - if (!user_indir_size && !user_key_size) - return 0; - if ((user_indir_size && (user_indir_size != dev_indir_size)) || (user_key_size && (user_key_size != dev_key_size))) return -EINVAL; @@ -740,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); - if (!ret) { - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh, rss_config[0]), - rss_config, total_size)) - ret = -EFAULT; - } + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (ret) + goto out; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), + &dev_hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) { + ret = -EFAULT; + } +out: kfree(rss_config); return ret; @@ -766,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || - !ops->get_rxnfc || !ops->set_rxfh) + if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; - /* If either indir or hash key is valid, proceed further. - * It is not valid to request that both be unchanged. + /* If either indir, hash key or function is valid, proceed further. + * Must request at least one change: indir size, hash key or function. 
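
Concretely, the check below admits a request that changes only the hash function: indir_size set to ETH_RXFH_INDIR_NO_CHANGE, key_size zero, and hfunc something other than ETH_RSS_HASH_NO_CHANGE. A userspace sketch against the UAPI this series extends (fd is an ordinary socket; the field layout assumes the struct ethtool_rxfh with the new hfunc/rsvd8/rsvd32 members; error handling elided):

#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_rss_hfunc_only(int fd, const char *ifname, __u8 hfunc)
{
	struct ethtool_rxfh rxfh;
	struct ifreq ifr;

	memset(&rxfh, 0, sizeof(rxfh));
	rxfh.cmd = ETHTOOL_SRSSH;
	rxfh.indir_size = ETH_RXFH_INDIR_NO_CHANGE;	/* keep the table */
	rxfh.key_size = 0;				/* keep the key */
	rxfh.hfunc = hfunc;	/* e.g. ETH_RSS_HASH_XOR, assuming this series' uapi macro */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&rxfh;
	return ioctl(fd, SIOCETHTOOL, &ifr);
}
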
*/ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0)) + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) @@ -835,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey); + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); out: kfree(rss_config); @@ -1036,7 +1051,8 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; - if (!ops->get_eeprom || !ops->get_eeprom_len) + if (!ops->get_eeprom || !ops->get_eeprom_len || + !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, @@ -1052,7 +1068,8 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) u8 *data; int ret = 0; - if (!ops->set_eeprom || !ops->get_eeprom_len) + if (!ops->set_eeprom || !ops->get_eeprom_len || + !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) @@ -1621,6 +1638,81 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. 
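
The tunable plumbing above accepts only ETHTOOL_RX_COPYBREAK and ETHTOOL_TX_COPYBREAK as u32 values for now. A userspace sketch of driving ETHTOOL_STUNABLE, assuming the uapi struct ethtool_tunable added by this series with the payload placed directly after the header (as the copy_from_user() at useraddr + sizeof(tuna) implies):

#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_rx_copybreak(int fd, const char *ifname, __u32 bytes)
{
	struct {
		struct ethtool_tunable hdr;
		__u32 value;		/* payload follows the header */
	} req;
	struct ifreq ifr;

	memset(&req, 0, sizeof(req));
	req.hdr.cmd = ETHTOOL_STUNABLE;
	req.hdr.id = ETHTOOL_RX_COPYBREAK;
	req.hdr.type_id = ETHTOOL_TUNABLE_U32;
	req.hdr.len = sizeof(req.value);	/* must be sizeof(u32), see ethtool_tunable_valid() */
	req.value = bytes;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&req;
	return ioctl(fd, SIOCETHTOOL, &ifr);
}
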
Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1762,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1950,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0f..ec9baea10c16 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -44,6 +44,7 @@ #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> +#include <linux/bpf.h> /** * sk_filter - run a packet through a socket filter @@ -51,9 +52,9 @@ * @skb: buffer to filter * * Run the filter code and then cut skb->data to correct size returned by - * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller + * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to sk_run_filter. It returns 0 if the packet should + * wrapper to SK_RUN_FILTER. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -87,33 +88,9 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Helper to find the offset of pkt_type in sk_buff structure. We want - * to make sure its still a 3bit field starting at a byte boundary; - * taken from arch/x86/net/bpf_jit_comp.c. - */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static unsigned int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { .pkt_type = ~0, }; - u8 *ct = (u8 *) &skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - - pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); - return -1; -} - static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { - return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff((struct sk_buff *)(unsigned long) ctx); } static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) @@ -190,11 +167,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - pkt_type_offset()); - if (insn->off < 0) - return false; - insn++; + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + PKT_TYPE_OFFSET()); *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD insn++; @@ -593,11 +567,8 @@ err: /* Security: * - * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). - * * As we dont want to clear mem[] array for each packet going through - * sk_run_filter(), we check that filter loaded by user never try to read + * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. 
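
That checker, bpf_check_classic() in the surrounding code, runs before a program is ever attached, so a filter with unwritten mem[] reads or bad branch targets never reaches __bpf_prog_run(). For reference, a minimal userspace example of the classic attach path it guards, which truncates every packet delivered to the socket to 96 bytes:

#include <linux/filter.h>
#include <sys/socket.h>

static int attach_trivial_filter(int fd)
{
	struct sock_filter code[] = {
		BPF_STMT(BPF_RET | BPF_K, 96),	/* accept, cap at 96 bytes */
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}
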
*/ @@ -843,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - bpf_release_orig_filter(prog); - bpf_prog_free(prog); + if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } } static void __sk_filter_release(struct sk_filter *fp) @@ -933,7 +908,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -972,7 +947,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) int err; fp->bpf_func = NULL; - fp->jited = 0; + fp->jited = false; err = bpf_check_classic(fp->insns, fp->len); if (err) { @@ -1013,7 +988,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,12 +1044,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; if (copy_from_user(prog->insns, fprog->filter, fsize)) { - kfree(prog); + __bpf_prog_free(prog); return -EFAULT; } @@ -1082,7 +1057,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) err = bpf_prog_store_orig_filter(prog, fprog); if (err) { - kfree(prog); + __bpf_prog_free(prog); return -ENOMEM; } @@ -1118,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); +#ifdef CONFIG_BPF_SYSCALL +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + struct bpf_prog *prog; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get(ufd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + /* valid fd, but invalid program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + bpf_prog_put(prog); + return -ENOMEM; + } + fp->prog = prog; + + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +/* allow socket filters to call + * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() + */ +static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: + return NULL; + } +} + +static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* skb fields cannot be accessed yet */ + return false; +} + +static struct bpf_verifier_ops sock_filter_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = 
sock_filter_is_valid_access, +}; + +static struct bpf_prog_type_list tl = { + .ops = &sock_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, +}; + +static int __init register_sock_filter_ops(void) +{ + bpf_register_prog_type(&tl); + return 0; +} +late_initcall(register_sock_filter_ops); +#else +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + return -EOPNOTSUPP; +} +#endif int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5f362c1d0332..45084938c403 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <net/flow_keys.h> +#include <scsi/fc/fc_fcoe.h> /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; @@ -26,36 +27,61 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i } /** - * skb_flow_get_ports - extract the upper layer ports and return them - * @skb: buffer to extract the ports from + * __skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) { int poff = proto_ports_offset(ip_proto); + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + if (poff >= 0) { __be32 *ports, _ports; - ports = skb_header_pointer(skb, thoff + poff, - sizeof(_ports), &_ports); + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } -EXPORT_SYMBOL(skb_flow_get_ports); +EXPORT_SYMBOL(__skb_flow_get_ports); -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the struct flow_keys from either the skbuff + * or a raw buffer specified by the rest parameters + */ +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen) { - int nhoff = skb_network_offset(skb); u8 ip_proto; - __be16 proto = skb->protocol; + + if (!data) { + data = skb->data; + proto = skb->protocol; + nhoff = skb_network_offset(skb); + hlen = skb_headlen(skb); + } memset(flow, 0, sizeof(*flow)); @@ -65,7 +91,7 @@ again: const struct iphdr *iph; struct iphdr _iph; ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) return false; nhoff += iph->ihl * 4; @@ -74,6 +100,13 @@ ip: if (ip_is_fragment(iph)) ip_proto = 0; + /* skip the 
address processing if skb is NULL. The assumption + * here is that if there is no skb we are not looking for flow + * info but lengths and protocols. + */ + if (!skb) + break; + iph_to_flow_copy_addrs(flow, iph); break; } @@ -83,14 +116,19 @@ ip: __be32 flow_label; ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) return false; ip_proto = iph->nexthdr; + nhoff += sizeof(struct ipv6hdr); + + /* see comment above in IPv4 section */ + if (!skb) + break; + flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); - nhoff += sizeof(struct ipv6hdr); flow_label = ip6_flowlabel(iph); if (flow_label) { @@ -113,7 +151,7 @@ ipv6: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) return false; @@ -126,7 +164,7 @@ ipv6: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; proto = hdr->proto; @@ -140,6 +178,9 @@ ipv6: return false; } } + case htons(ETH_P_FCOE): + flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + /* fall through */ default: return false; } @@ -151,7 +192,7 @@ ipv6: __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; /* @@ -171,8 +212,9 @@ ipv6: const struct ethhdr *eth; struct ethhdr _eth; - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); if (!eth) return false; proto = eth->h_proto; @@ -194,12 +236,16 @@ ipv6: flow->n_proto = proto; flow->ip_proto = ip_proto; - flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; + /* unless skb is set we don't need to record port info */ + if (skb) + flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + return true; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); static u32 hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) @@ -286,34 +332,27 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. 
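
With the NULL-skb mode introduced above, a caller that only needs lengths and protocols can dissect a raw frame before any skb exists (the pattern used by header-length estimation on receive). A sketch under that assumption; raw_frame_thoff() is an illustrative helper, not part of the patch:

#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <net/flow_keys.h>

static int raw_frame_thoff(void *frame, unsigned int len)
{
	struct ethhdr *eth = frame;
	struct flow_keys keys;

	if (len < sizeof(*eth))
		return -EINVAL;

	if (!__skb_flow_dissect(NULL, &keys, frame, eth->h_proto,
				sizeof(*eth), len))
		return -EINVAL;

	/* addresses and ports are deliberately skipped when skb == NULL */
	return keys.thoff;
}
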
- */ -u32 __skb_get_poff(const struct sk_buff *skb) +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen) { - struct flow_keys keys; - u32 poff = 0; + u32 poff = keys->thoff; - if (!skb_flow_dissect(skb, &keys)) - return 0; - - poff += keys.thoff; - switch (keys.ip_proto) { + switch (keys->ip_proto) { case IPPROTO_TCP: { - const struct tcphdr *tcph; - struct tcphdr _tcph; + /* access doff as u8 to avoid unaligned access */ + const u8 *doff; + u8 _doff; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); - if (!tcph) + doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), + data, hlen, &_doff); + if (!doff) return poff; - poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); + poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: @@ -343,6 +382,21 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } +/* skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -359,9 +413,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map->len == 1) queue_index = map->queues[0]; else - queue_index = map->queues[ - ((u64)skb_get_hash(skb) * map->len) >> 32]; - + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9d33dfffca19..9dfb88a933e7 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -91,6 +91,8 @@ struct gen_estimator u32 avpps; struct rcu_head e_rcu; struct rb_node node; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct rcu_head head; }; struct gen_estimator_head @@ -115,9 +117,8 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { - u64 nbytes; + struct gnet_stats_basic_packed b = {0}; u64 brate; - u32 npackets; u32 rate; spin_lock(e->stats_lock); @@ -125,15 +126,15 @@ static void est_timer(unsigned long arg) if (e->bstats == NULL) goto skip; - nbytes = e->bstats->bytes; - npackets = e->bstats->packets; - brate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; + __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + + brate = (b.bytes - e->last_bytes)<<(7 - idx); + e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; + rate = (b.packets - e->last_packets)<<(12 - idx); + e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); e->rate_est->pps = (e->avpps+0x1FF)>>10; skip: @@ -203,12 +204,14 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { struct 
gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); + struct gnet_stats_basic_packed b = {0}; int idx; if (nla_len(opt) < sizeof(*parm)) @@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; + __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; est->ewma_log = parm->ewma_log; - est->last_bytes = bstats->bytes; + est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; - est->last_packets = bstats->packets; + est->last_packets = b.packets; est->avpps = rate_est->pps<<10; + est->cpu_bstats = cpu_bstats; spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { @@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 2ddbce4cce14..0c08062d1796 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); +static void +__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu) +{ + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes; + u32 packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = bcpu->bstats.bytes; + packets = bcpu->bstats.packets; + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + bstats->bytes += bytes; + bstats->packets += packets; + } +} + +void +__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + if (cpu) { + __gnet_stats_copy_basic_cpu(bstats, cpu); + } else { + bstats->bytes = b->bytes; + bstats->packets = b->packets; + } +} +EXPORT_SYMBOL(__gnet_stats_copy_basic); + /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle @@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy); * if the room in the socket buffer was not sufficient. 
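
The fetch_begin/fetch_retry loop in __gnet_stats_copy_basic_cpu() above pairs with writers that bump the per-CPU counters inside a u64_stats critical section, keeping 64-bit reads torn-free on 32-bit hosts. A sketch of that update side (bstats_cpu_update() is an illustrative helper, not part of this diff):

#include <linux/u64_stats_sync.h>
#include <net/gen_stats.h>

static void bstats_cpu_update(struct gnet_stats_basic_cpu *bcpu,
			      unsigned int bytes)
{
	u64_stats_update_begin(&bcpu->syncp);
	bcpu->bstats.bytes += bytes;
	bcpu->bstats.packets++;
	u64_stats_update_end(&bcpu->syncp);
}
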
*/ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) { + struct gnet_stats_basic_packed bstats = {0}; + + __gnet_stats_copy_basic(&bstats, cpu, b); + if (d->compat_tc_stats) { - d->tc_stats.bytes = b->bytes; - d->tc_stats.packets = b->packets; + d->tc_stats.bytes = bstats.bytes; + d->tc_stats.packets = bstats.packets; } if (d->tail) { struct gnet_stats_basic sb; memset(&sb, 0, sizeof(sb)); - sb.bytes = b->bytes; - sb.packets = b->packets; + sb.bytes = bstats.bytes; + sb.packets = bstats.packets; return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); } return 0; @@ -172,29 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); +static void +__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) +{ + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + + qstats->qlen = 0; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; + qstats->overlimits += qcpu->overlimits; + } +} + +static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) +{ + if (cpu) { + __gnet_stats_copy_queue_cpu(qstats, cpu); + } else { + qstats->qlen = q->qlen; + qstats->backlog = q->backlog; + qstats->drops = q->drops; + qstats->requeues = q->requeues; + qstats->overlimits = q->overlimits; + } + + qstats->qlen = qlen; +} + /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle + * @cpu_q: per cpu queue statistics * @q: queue statistics + * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by - * gnet_stats_start_copy(). + * gnet_stats_start_copy(). Using per cpu queue statistics if + * they are available. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen) { + struct gnet_stats_queue qstats = {0}; + + __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); + if (d->compat_tc_stats) { - d->tc_stats.drops = q->drops; - d->tc_stats.qlen = q->qlen; - d->tc_stats.backlog = q->backlog; - d->tc_stats.overlimits = q->overlimits; + d->tc_stats.drops = qstats.drops; + d->tc_stats.qlen = qstats.qlen; + d->tc_stats.backlog = qstats.backlog; + d->tc_stats.overlimits = qstats.overlimits; } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + return gnet_stats_copy(d, TCA_STATS_QUEUE, + &qstats, sizeof(qstats)); return 0; } diff --git a/net/core/iovec.c b/net/core/iovec.c index e1ec45ab1e63..dcbe98b3726a 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -28,53 +28,6 @@ #include <net/sock.h> /* - * Verify iovec. The caller must ensure that the iovec is big enough - * to hold the message iovec. - * - * Save time not doing access_ok. copy_*_user will make this work - * in any case. 
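
The verify_iovec() removal below completes the move to iov_iter: the iterator carries its own cursor and residual count, so callers no longer copy in and mutate the iovec array themselves. A sketch of the replacement pattern, assuming iov_iter_init() still takes (direction, iov, nr_segs, count) as it does at this point in the tree:

#include <linux/fs.h>	/* READ */
#include <linux/uio.h>

static int copy_buf_to_user_iovec(const void *buf, size_t len,
				  const struct iovec *iov,
				  unsigned long nr_segs)
{
	struct iov_iter to;

	iov_iter_init(&to, READ, iov, nr_segs, len);
	if (copy_to_iter(buf, len, &to) != len)
		return -EFAULT;	/* short copy: a user segment faulted */
	return 0;
}
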
- */ - -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode) -{ - int size, ct, err; - - if (m->msg_name && m->msg_namelen) { - if (mode == VERIFY_READ) { - void __user *namep; - namep = (void __user __force *) m->msg_name; - err = move_addr_to_kernel(namep, m->msg_namelen, - address); - if (err < 0) - return err; - } - m->msg_name = address; - } else { - m->msg_name = NULL; - m->msg_namelen = 0; - } - - size = m->msg_iovlen * sizeof(struct iovec); - if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) - return -EFAULT; - - m->msg_iov = iov; - err = 0; - - for (ct = 0; ct < m->msg_iovlen; ct++) { - size_t len = iov[ct].iov_len; - - if (len > INT_MAX - err) { - len = INT_MAX - err; - iov[ct].iov_len = len; - } - err += len; - } - - return err; -} - -/* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram * Calls to csum_partial but the last must be in 32 bit chunks diff --git a/net/core/link_watch.c b/net/core/link_watch.c index bd0767e6b2b3..49a9e3e06c08 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -21,7 +21,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> -#include <asm/types.h> +#include <linux/types.h> enum lw_bits { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef31fef25e5a..8e38f17288d3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags); static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS static const struct file_operations neigh_stat_seq_fops; #endif @@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops; the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. - - The last lock is neigh_tbl_lock. It is pure SMP lock, protecting - list of neighbour tables. 
This list is used only in process context, */ -static DEFINE_RWLOCK(neigh_tbl_lock); - static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); @@ -773,7 +767,7 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; tbl->last_rand = jiffies; - for (p = &tbl->parms; p; p = p->next) + list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } @@ -1446,7 +1440,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, { struct neigh_parms *p; - for (p = &tbl->parms; p; p = p->next) { + list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; @@ -1481,8 +1475,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, } write_lock_bh(&tbl->lock); - p->next = tbl->parms.next; - tbl->parms.next = p; + list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); @@ -1501,24 +1494,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head) void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { - struct neigh_parms **p; - if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); - for (p = &tbl->parms.next; *p; p = &(*p)->next) { - if (*p == parms) { - *p = parms->next; - parms->dead = 1; - write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); - call_rcu(&parms->rcu_head, neigh_rcu_free_parms); - return; - } - } + list_del(&parms->list); + parms->dead = 1; write_unlock_bh(&tbl->lock); - neigh_dbg(1, "%s: not found\n", __func__); + if (parms->dev) + dev_put(parms->dev); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -1530,11 +1514,15 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static void neigh_table_init_no_netlink(struct neigh_table *tbl) +static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; + +void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; + INIT_LIST_HEAD(&tbl->parms_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); atomic_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = @@ -1574,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; -} - -void neigh_table_init(struct neigh_table *tbl) -{ - struct neigh_table *tmp; - - neigh_table_init_no_netlink(tbl); - write_lock(&neigh_tbl_lock); - for (tmp = neigh_tables; tmp; tmp = tmp->next) { - if (tmp->family == tbl->family) - break; - } - tbl->next = neigh_tables; - neigh_tables = tbl; - write_unlock(&neigh_tbl_lock); - if (unlikely(tmp)) { - pr_err("Registering multiple tables for family %d\n", - tbl->family); - dump_stack(); - } + neigh_tables[index] = tbl; } EXPORT_SYMBOL(neigh_table_init); -int neigh_table_clear(struct neigh_table *tbl) +int neigh_table_clear(int index, struct neigh_table *tbl) { - struct neigh_table **tp; - + neigh_tables[index] = NULL; /* It is not clean... 
Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); @@ -1609,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl) neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); - write_lock(&neigh_tbl_lock); - for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { - if (*tp == tbl) { - *tp = tbl->next; - break; - } - } - write_unlock(&neigh_tbl_lock); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); @@ -1634,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); +static struct neigh_table *neigh_find_table(int family) +{ + struct neigh_table *tbl = NULL; + + switch (family) { + case AF_INET: + tbl = neigh_tables[NEIGH_ARP_TABLE]; + break; + case AF_INET6: + tbl = neigh_tables[NEIGH_ND_TABLE]; + break; + case AF_DECnet: + tbl = neigh_tables[NEIGH_DN_TABLE]; + break; + } + + return tbl; +} + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; + struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; @@ -1660,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) } } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct neighbour *neigh; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); - - if (nla_len(dst_attr) < tbl->key_len) - goto out; - - if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out; - } + if (nla_len(dst_attr) < tbl->key_len) + goto out; - if (dev == NULL) - goto out; + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } - neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); - if (neigh == NULL) { - err = -ENOENT; - goto out; - } + if (dev == NULL) + goto out; - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); - neigh_release(neigh); + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; goto out; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); out: return err; @@ -1700,11 +1672,14 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; + struct neighbour *neigh; + void *dst, *lladdr; int err; ASSERT_RTNL(); @@ -1728,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; - struct neighbour *neigh; - void *dst, *lladdr; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? 
nla_data(tb[NDA_LLADDR]) : NULL; - if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out; - dst = nla_data(tb[NDA_DST]); - lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; - if (ndm->ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; + err = 0; + } + goto out; + } - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm->ndm_flags; - err = 0; - } + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto out; } - if (dev == NULL) + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); goto out; - - neigh = neigh_lookup(tbl, dst, dev); - if (neigh == NULL) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - err = -ENOENT; - goto out; - } - - neigh = __neigh_lookup_errno(tbl, dst, dev); - if (IS_ERR(neigh)) { - err = PTR_ERR(neigh); - goto out; - } - } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { - err = -EEXIST; - neigh_release(neigh); - goto out; - } - - if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - if (ndm->ndm_flags & NTF_USE) { - neigh_event_send(neigh, NULL); - err = 0; - } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); - neigh_release(neigh); - goto out; + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + out: return err; } @@ -1990,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; - int err; + bool found = false; + int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy); @@ -2003,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) } ndtmsg = nlmsg_data(nlh); - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + tbl = neigh_tables[tidx]; + if (!tbl) + continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; - - if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { + found = true; break; + } } - if (tbl == NULL) { - err = -ENOENT; - goto errout_locked; - } + if (!found) + return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and @@ -2126,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) errout_tbl_lock: write_unlock_bh(&tbl->lock); -errout_locked: - read_unlock(&neigh_tbl_lock); errout: return err; } @@ -2142,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; + tbl = neigh_tables[tidx]; + if (!tbl) + continue; + if (tidx < tbl_skip || (family && tbl->family != family)) continue; @@ -2154,7 
+2123,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) NLM_F_MULTI) <= 0) break; - for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + nidx = 0; + p = list_next_entry(&tbl->parms, list); + list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2174,7 +2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: - read_unlock(&neigh_tbl_lock); cb->args[0] = tidx; cb->args[1] = nidx; @@ -2357,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) int proxy = 0; int err; - read_lock(&neigh_tbl_lock); family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is @@ -2369,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl; - tbl = tbl->next, t++) { + for (t = 0; t < NEIGH_NR_TABLES; t++) { + tbl = neigh_tables[t]; + + if (!tbl) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) @@ -2383,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } - read_unlock(&neigh_tbl_lock); cb->args[0] = t; return skb->len; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9dd06699b09c..999341244434 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -12,6 +12,7 @@ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> +#include <net/switchdev.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/nsproxy.h> @@ -325,6 +326,23 @@ static ssize_t tx_queue_len_store(struct device *dev, } NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) +{ + dev->gro_flush_timeout = val; + return 0; +} + +static ssize_t gro_flush_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); +} +NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -387,7 +405,7 @@ static ssize_t phys_port_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) @@ -399,6 +417,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_switch_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_item_id ppid; + + ret = netdev_switch_parent_id_get(netdev, &ppid); + if (!ret) + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_switch_id); + static struct attribute *net_class_attrs[] = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -422,7 +462,9 @@ static struct attribute *net_class_attrs[] = { &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + 
&dev_attr_phys_switch_id.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7c6b51a58968..ce780c722e48 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -224,7 +224,7 @@ static void net_free(struct net *net) return; } #endif - kfree(net->gen); + kfree(rcu_access_pointer(net->gen)); kmem_cache_free(net_cachep, net); } @@ -337,17 +337,17 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct proc_ns *ei; struct file *file; + struct ns_common *ns; struct net *net; file = proc_ns_fget(fd); if (IS_ERR(file)) return ERR_CAST(file); - ei = get_proc_ns(file_inode(file)); - if (ei->ns_ops == &netns_operations) - net = get_net(ei->ns); + ns = get_proc_ns(file_inode(file)); + if (ns->ops == &netns_operations) + net = get_net(container_of(ns, struct net, ns)); else net = ERR_PTR(-EINVAL); @@ -386,12 +386,15 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_pid); static __net_init int net_ns_net_init(struct net *net) { - return proc_alloc_inum(&net->proc_inum); +#ifdef CONFIG_NET_NS + net->ns.ops = &netns_operations; +#endif + return ns_alloc_inum(&net->ns); } static __net_exit void net_ns_net_exit(struct net *net) { - proc_free_inum(net->proc_inum); + ns_free_inum(&net->ns); } static struct pernet_operations __net_initdata net_ns_ops = { @@ -629,7 +632,7 @@ void unregister_pernet_device(struct pernet_operations *ops) EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS -static void *netns_get(struct task_struct *task) +static struct ns_common *netns_get(struct task_struct *task) { struct net *net = NULL; struct nsproxy *nsproxy; @@ -640,17 +643,22 @@ static void *netns_get(struct task_struct *task) net = get_net(nsproxy->net_ns); task_unlock(task); - return net; + return net ? 
&net->ns : NULL; } -static void netns_put(void *ns) +static inline struct net *to_net_ns(struct ns_common *ns) { - put_net(ns); + return container_of(ns, struct net, ns); } -static int netns_install(struct nsproxy *nsproxy, void *ns) +static void netns_put(struct ns_common *ns) { - struct net *net = ns; + put_net(to_net_ns(ns)); +} + +static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -661,18 +669,11 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int netns_inum(void *ns) -{ - struct net *net = ns; - return net->proc_inum; -} - const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, - .inum = netns_inum, }; #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 907fb5e36c02..e0ad5d16c9c5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -80,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this @@ -89,12 +87,9 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, */ goto out; } - skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; @@ -116,7 +111,7 @@ static void queue_process(struct work_struct *work) continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); local_irq_save(flags); HARD_TX_LOCK(dev, txq, smp_processor_id()); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b849ddfef2e..da934fc3faa8 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -202,6 +202,7 @@ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ #define F_UDPCSUM (1<<16) /* Include UDP checksum */ +#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -386,6 +387,7 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; __u32 skb_priority; /* skb priority field */ + unsigned int burst; /* number of duplicated packets to burst */ int node; /* Memory node */ #ifdef CONFIG_XFRM @@ -505,7 +507,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warning("Unknown command: %s\n", data); + pr_warn("Unknown command: %s\n", data); return count; } @@ -612,6 +614,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->traffic_class) seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + if (pkt_dev->burst > 1) + seq_printf(seq, " burst: %d\n", pkt_dev->burst); + if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); @@ -638,6 
+643,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPCSUM) seq_puts(seq, "UDPCSUM "); + if (pkt_dev->flags & F_NO_TIMESTAMP) + seq_puts(seq, "NO_TIMESTAMP "); + if (pkt_dev->flags & F_MPLS_RND) seq_puts(seq, "MPLS_RND "); @@ -857,14 +865,14 @@ static ssize_t pktgen_if_write(struct file *file, pg_result = &(pkt_dev->result[0]); if (count < 1) { - pr_warning("wrong command format\n"); + pr_warn("wrong command format\n"); return -EINVAL; } max = count; tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { - pr_warning("illegal format\n"); + pr_warn("illegal format\n"); return tmp; } i = tmp; @@ -1120,6 +1128,16 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->dst_mac_count); return count; } + if (!strcmp(name, "burst")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->burst = value < 1 ? 1 : value; + sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); + return count; + } if (!strcmp(name, "node")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1243,6 +1261,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!UDPCSUM") == 0) pkt_dev->flags &= ~F_UDPCSUM; + else if (strcmp(f, "NO_TIMESTAMP") == 0) + pkt_dev->flags |= F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", @@ -1251,6 +1272,7 @@ static ssize_t pktgen_if_write(struct file *file, "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " + "NO_TIMESTAMP, " #ifdef CONFIG_XFRM "IPSEC, " #endif @@ -2048,15 +2070,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ntxq = pkt_dev->odev->real_num_tx_queues; if (ntxq <= pkt_dev->queue_map_min) { - pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_min = (ntxq ?: 1) - 1; } if (pkt_dev->queue_map_max >= ntxq) { - pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_max = (ntxq ?: 1) - 1; } @@ -2685,9 +2707,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->pgh_magic = htonl(PKTGEN_MAGIC); pgh->seq_num = htonl(pkt_dev->seq_num); - do_gettimeofday(&timestamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + if (pkt_dev->flags & F_NO_TIMESTAMP) { + pgh->tv_sec = 0; + pgh->tv_usec = 0; + } else { + do_gettimeofday(&timestamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, @@ -3160,8 +3187,8 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) int nr_frags = pkt_dev->skb ?
skb_shinfo(pkt_dev->skb)->nr_frags : -1; if (!pkt_dev->running) { - pr_warning("interface: %s is already stopped\n", - pkt_dev->odevname); + pr_warn("interface: %s is already stopped\n", + pkt_dev->odevname); return -EINVAL; } @@ -3284,11 +3311,9 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { + unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; - u16 queue_map; int ret; /* If device is offline, then don't send */ @@ -3326,8 +3351,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3338,16 +3362,19 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; goto unlock; } - atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + atomic_add(burst, &pkt_dev->skb->users); + +xmit_more: + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq)) + goto xmit_more; break; case NET_XMIT_DROP: case NET_XMIT_CN: @@ -3366,6 +3393,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) atomic_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; } + if (unlikely(burst)) + atomic_sub(burst, &pkt_dev->skb->users); unlock: HARD_TX_UNLOCK(odev, txq); @@ -3564,6 +3593,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; + pkt_dev->burst = 1; pkt_dev->node = -1; err = pktgen_setup_dev(t->net, pkt_dev, ifname); @@ -3684,7 +3714,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { - pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); + pr_warn("WARNING: trying to remove a running interface, stopping it now\n"); pktgen_stop_device(pkt_dev); } @@ -3698,8 +3728,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race * with proc_create_data() */ - if (pkt_dev->entry) - proc_remove(pkt_dev->entry); + proc_remove(pkt_dev->entry); /* And update the thread if_list */ _rem_dev_from_if_list(t, pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0493e3b7471..9cf6fe9ddc0c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -36,6 +36,7 @@ #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> +#include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> @@ -43,6 +44,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> +#include <net/switchdev.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> @@ -365,11 +367,10 @@ static void rtnl_lock_unregistering_all(void) { struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - 
prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); for_each_net(net) { @@ -381,9 +382,10 @@ static void rtnl_lock_unregistering_all(void) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } /** @@ -868,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ - + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -952,7 +955,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { @@ -967,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_item_id psid; + + err = netdev_switch_parent_id_get(dev, &psid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1039,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1196,8 +1220,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, - [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ + [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1481,9 +1506,12 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } +#define DO_SETLINK_MODIFIED 0x01 +/* notify flag means notify + modified. 
*/ +#define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, - struct nlattr **tb, char *ifname, int modified) + struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -1495,6 +1523,7 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); err = -EPERM; goto errout; } @@ -1502,7 +1531,7 @@ static int do_setlink(const struct sk_buff *skb, put_net(net); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { @@ -1531,7 +1560,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { @@ -1551,19 +1580,19 @@ static int do_setlink(const struct sk_buff *skb, kfree(sa); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); - modified = 1; + status |= DO_SETLINK_NOTIFY; } /* @@ -1575,7 +1604,7 @@ static int do_setlink(const struct sk_buff *skb, err = dev_change_name(dev, ifname); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { @@ -1583,7 +1612,7 @@ static int do_setlink(const struct sk_buff *skb, nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { @@ -1601,25 +1630,35 @@ static int do_setlink(const struct sk_buff *skb, err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_TXQLEN]) { + unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); + + if (dev->tx_queue_len ^ value) + status |= DO_SETLINK_NOTIFY; + + dev->tx_queue_len = value; + } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + write_lock_bh(&dev_base_lock); - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (dev->link_mode ^ value) + status |= DO_SETLINK_NOTIFY; + dev->link_mode = value; write_unlock_bh(&dev_base_lock); } @@ -1634,7 +1673,7 @@ static int do_setlink(const struct sk_buff *skb, err = do_setvfinfo(dev, attr); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1664,7 +1703,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1682,7 +1721,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { @@ -1699,15 +1738,20 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; errout: - if (err < 0 && modified) - net_warn_ratelimited("A link change request failed with some changes committed already. 
Interface %s may have been left with an inconsistent configuration, please check.\n", - dev->name); + if (status & DO_SETLINK_MODIFIED) { + if (status & DO_SETLINK_NOTIFY) + netdev_state_change(dev); + + if (err < 0) + net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", + dev->name); + } return err; } @@ -1989,7 +2033,7 @@ replay: } if (dev) { - int modified = 0; + int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; @@ -2004,7 +2048,7 @@ replay: err = ops->changelink(dev, tb, data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { @@ -2015,10 +2059,10 @@ replay: tb, slave_data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, tb, ifname, modified); + return do_setlink(skb, dev, ifm, tb, ifname, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -2202,8 +2246,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2221,11 +2265,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); - return; + return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + struct sk_buff *skb; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags); } EXPORT_SYMBOL(rtmsg_ifinfo); @@ -2294,7 +2355,7 @@ errout: int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, + const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; @@ -2307,6 +2368,11 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, return err; } + if (vid) { + pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name); + return err; + } + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) @@ -2320,6 +2386,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) +{ + u16 vid = 0; + + if (vlan_attr) { + if (nla_len(vlan_attr) != sizeof(u16)) { + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n"); + return -EINVAL; + } + + vid = nla_get_u16(vlan_attr); + + if (!vid || vid >= VLAN_VID_MASK) { + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n", + vid); + return -EINVAL; + } + } + *p_vid = vid; + return 0; +} + static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -2327,6 +2415,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; + u16 vid; int err; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); @@ -2352,6 +2441,10 @@ static int 
rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ @@ -2360,7 +2453,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; - err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); + err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, + nlh->nlmsg_flags); if (err) goto out; else @@ -2371,9 +2465,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if ((ndm->ndm_flags & NTF_SELF)) { if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + vid, nlh->nlmsg_flags); else - err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err) { @@ -2391,7 +2486,7 @@ out: int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { int err = -EINVAL; @@ -2420,6 +2515,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) struct net_device *dev; int err = -EINVAL; __u8 *addr; + u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; @@ -2447,6 +2543,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ @@ -2456,7 +2556,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) const struct net_device_ops *ops = br_dev->netdev_ops; if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); if (err) goto out; @@ -2467,9 +2567,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { if (dev->netdev_ops->ndo_fdb_del) - err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr, + vid); else - err = ndo_dflt_fdb_del(ndm, tb, dev, addr); + err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); @@ -2609,12 +2710,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, + unsigned int attrnum, unsigned int flag) +{ + if (mask & flag) + return nla_put_u8(skb, attrnum, !!(flags & flag)); + return 0; +} + int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u16 mode) + struct net_device *dev, u16 mode, + u32 flags, u32 mask) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; + struct nlattr *protinfo; u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); @@ -2646,13 +2757,46 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (!br_afspec) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || - nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } + + if (mode != BRIDGE_MODE_UNDEF) { + if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); + protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); + if (!protinfo) + goto nla_put_failure; + + if (brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_FAST_LEAVE, + BR_MULTICAST_FAST_LEAVE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING, BR_LEARNING) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROXYARP, BR_PROXYARP)) { + nla_nest_cancel(skb, protinfo); + goto nla_put_failure; + } + + nla_nest_end(skb, protinfo); + return nlmsg_end(skb, nlh); nla_put_failure: nlmsg_cancel(skb, nlh); @@ -2667,13 +2811,20 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; - struct nlattr *extfilt; u32 filter_mask = 0; - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), - IFLA_EXT_MASK); - if (extfilt) - filter_mask = nla_get_u32(extfilt); + if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { + struct nlattr *extfilt; + + extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), + IFLA_EXT_MASK); + if (extfilt) { + if (nla_len(extfilt) < sizeof(filter_mask)) + return -EINVAL; + + filter_mask = nla_get_u32(extfilt); + } + } rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -2780,6 +2931,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + have_flags = true; flags = nla_get_u16(attr); break; @@ -2850,6 +3004,9 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + have_flags = true; flags = nla_get_u16(attr); break; diff --git a/net/core/scm.c b/net/core/scm.c index b442e7e25e60..3b6899b7d810 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) struct cmsghdr *cmsg; int err; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) - { + for_each_cmsghdr(cmsg, msg) { err = -EINVAL; /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index ba71212f0251..51dd3193a33e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq) * 
overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. (period of 274 s) */ - return seq + (ktime_to_ns(ktime_get_real()) >> 6); + return seq + (ktime_get_real_ns() >> 6); } #endif @@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; @@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, md5_transform(hash, secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8d289697cc7a..395c15b82087 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,16 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + atomic_set(&fclones->fclone_ref, 1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + fclones->skb2.fclone = SKB_FCLONE_CLONE; + fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -336,48 +336,85 @@ struct netdev_alloc_cache { unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, + gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; - void *data = NULL; - int order; - unsigned long flags; + const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; + struct page *page = NULL; + gfp_t gfp = gfp_mask; + + if (order) { + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); + nc->frag.size = PAGE_SIZE << (page ? order : 0); + } - local_irq_save(flags); - nc = &__get_cpu_var(netdev_alloc_cache); - if (unlikely(!nc->frag.page)) { + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->frag.page = page; + + return page; +} + +static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, + unsigned int fragsz, gfp_t gfp_mask) +{ + struct netdev_alloc_cache *nc = this_cpu_ptr(cache); + struct page *page = nc->frag.page; + unsigned int size; + int offset; + + if (unlikely(!page)) { refill: - for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { - gfp_t gfp = gfp_mask; + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; - nc->frag.page = alloc_pages(gfp, order); - if (likely(nc->frag.page)) - break; - if (--order < 0) - goto end; - } - nc->frag.size = PAGE_SIZE << order; -recycle: - atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; + /* if size can vary use frag.size else just use PAGE_SIZE */ + size = NETDEV_FRAG_PAGE_MAX_ORDER ? 
nc->frag.size : PAGE_SIZE; + + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. + */ + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + nc->frag.offset = size; } - if (nc->frag.offset + fragsz > nc->frag.size) { - /* avoid unnecessary locked operations if possible */ - if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || - atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) - goto recycle; - goto refill; + offset = nc->frag.offset - fragsz; + if (unlikely(offset < 0)) { + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + + /* if size can vary use frag.size else just use PAGE_SIZE */ + size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; + + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; } - data = page_address(nc->frag.page) + nc->frag.offset; - nc->frag.offset += fragsz; nc->pagecnt_bias--; -end: + nc->frag.offset = offset; + + return page_address(page) + offset; +} + +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned long flags; + void *data; + + local_irq_save(flags); + data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -395,11 +432,25 @@ void *netdev_alloc_frag(unsigned int fragsz) } EXPORT_SYMBOL(netdev_alloc_frag); +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); +} + +void *napi_alloc_frag(unsigned int fragsz) +{ + return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} +EXPORT_SYMBOL(napi_alloc_frag); + /** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on + * __alloc_rx_skb - allocate an skbuff for rx * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb + * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case we have to fallback to __alloc_skb() + * If SKB_ALLOC_NAPI is set, page fragment will be allocated + * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate @@ -408,11 +459,11 @@ EXPORT_SYMBOL(netdev_alloc_frag); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) +static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, + int flags) { struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + + unsigned int fragsz = SKB_DATA_ALIGN(length) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { @@ -421,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __netdev_alloc_frag(fragsz, gfp_mask); + data = (flags & SKB_ALLOC_NAPI) ? 
+ __napi_alloc_frag(fragsz, gfp_mask) : + __netdev_alloc_frag(fragsz, gfp_mask); if (likely(data)) { skb = build_skb(data, fragsz); @@ -429,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, put_page(virt_to_head_page(data)); } } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, + skb = __alloc_skb(length, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); } + return skb; +} + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has NET_SKB_PAD headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD; + skb = __alloc_rx_skb(length, gfp_mask, 0); + if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; } + return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); +/** + * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages + * + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD + NET_IP_ALIGN; + skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + } + + return skb; +} +EXPORT_SYMBOL(__napi_alloc_skb); + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { @@ -491,32 +599,33 @@ static void skb_free_head(struct sk_buff *skb) static void skb_release_data(struct sk_buff *skb) { - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_frag_unref(skb, i); - } + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; + if (skb->cloned && + atomic_sub_return(skb->nohdr ? 
(1 << SKB_DATAREF_SHIFT) + 1 : 1, + &shinfo->dataref)) + return; - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } + for (i = 0; i < shinfo->nr_frags; i++) + __skb_frag_unref(&shinfo->frags[i]); - if (skb_has_frag_list(skb)) - skb_drop_fraglist(skb); + /* + * If skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { + struct ubuf_info *uarg; - skb_free_head(skb); + uarg = shinfo->destructor_arg; + if (uarg->callback) + uarg->callback(uarg, true); } + + if (shinfo->frag_list) + kfree_skb_list(shinfo->frag_list); + + skb_free_head(skb); } /* @@ -524,33 +633,32 @@ static void skb_release_data(struct sk_buff *skb) */ static void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); - break; + return; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); - break; - - case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; + fclones = container_of(skb, struct sk_buff_fclones, skb1); - /* The clone portion is available for - * fast-cloning again. + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; + if (atomic_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } + if (!atomic_dec_and_test(&fclones->fclone_ref)) + return; +fastpath: + kmem_cache_free(skbuff_fclone_cache, fclones); } static void skb_release_head_state(struct sk_buff *skb) @@ -566,7 +674,7 @@ static void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif /* XXX: IS this still necessary? 
- JHS */ @@ -674,57 +782,61 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +/* Make sure a field is enclosed inside headers_start/headers_end section */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ + offsetof(struct sk_buff, headers_start)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ + offsetof(struct sk_buff, headers_end)); \ + static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_protocol = old->inner_protocol; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - skb_copy_hash(new, old); - new->ooo_okay = old->ooo_okay; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; - new->encap_hdr_csum = old->encap_hdr_csum; - new->csum_valid = old->csum_valid; - new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum = old->csum; - new->ignore_df = old->ignore_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; + __nf_copy(new, old, false); + + /* Note : this field could be in headers_start/headers_end section + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers_start, &old->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; + CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; + CHECK_SKB_FIELD(tc_verd); #endif #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; - - skb_copy_secmark(new, old); -#ifdef CONFIG_NET_RX_BUSY_POLL - new->napi_id = old->napi_id; -#endif } /* @@ -855,17 +967,18 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); - n->fclone = SKB_FCLONE_CLONE; - 
atomic_inc(fclone_ref); + atomic_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -875,7 +988,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } @@ -2988,7 +3100,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg) { + if (!sg && !nskb->remcsum_offload) { nskb->ip_summed = CHECKSUM_NONE; nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), @@ -3060,7 +3172,7 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum) { + if (!csum && !nskb->remcsum_offload) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; @@ -3069,6 +3181,21 @@ perform_csum_check: } } while ((offset += len) < head_skb->len); + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; + + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } return segs; err: @@ -3182,7 +3309,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; pinfo->gso_size = 0; - skb_header_release(p); + __skb_header_release(p); NAPI_GRO_CB(nskb)->last = p; nskb->data_len += p->len; @@ -3214,7 +3341,7 @@ merge: else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + __skb_header_release(skb); lp = p; done: @@ -3230,7 +3357,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -3240,8 +3366,7 @@ void __init skb_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); @@ -3494,32 +3619,66 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); -void __skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps, - struct sock *sk, int tstype) +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { - struct sock_exterr_skb *serr; - struct sk_buff *skb; - int err; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next; + int err = 0; - if (!sk) - return; + spin_lock_bh(&q->lock); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) + err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + spin_unlock_bh(&q->lock); - if (hwtstamps) { - *skb_hwtstamps(orig_skb) = - *hwtstamps; - } else { - /* - * no hardware time stamps available, - * so keep the shared tx_flags and only - * store software time stamp - */ - orig_skb->tstamp = ktime_get_real(); + sk->sk_err = err; + if (err) + sk->sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + +/** + * skb_clone_sk - create clone of skb, and take reference to socket + * @skb: the skb to clone + * + * This function 
creates a clone of a buffer that holds a reference on + * sk_refcnt. Buffers created via this function are meant to be + * returned using sock_queue_err_skb, or free via kfree_skb. + * + * When passing buffers allocated with this function to sock_queue_err_skb + * it is necessary to wrap the call with sock_hold/sock_put in order to + * prevent the socket from being released prior to being enqueued on + * the sk_error_queue. + */ +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; } - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype) +{ + struct sock_exterr_skb *serr; + int err; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); @@ -3537,6 +3696,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); + + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + + sock_put(sk); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + + if (!sk) + return; + + if (hwtstamps) + *skb_hwtstamps(orig_skb) = *hwtstamps; + else + orig_skb->tstamp = ktime_get_real(); + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + __skb_complete_tx_timestamp(skb, sk, tstype); +} EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, @@ -3561,9 +3756,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); + err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); + + sock_put(sk); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); @@ -3864,7 +4064,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } @@ -3947,6 +4148,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; + skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); @@ -3966,15 +4168,22 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet); unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - return tcp_hdrlen(skb) + shinfo->gso_size; + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } /* UFO sets gso_size to the size 
of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already * accounted for. */ - return shinfo->gso_size; + return thlen + shinfo->gso_size; } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); @@ -4029,3 +4238,188 @@ err_free: return NULL; } EXPORT_SYMBOL(skb_vlan_untag); + +int skb_ensure_writable(struct sk_buff *skb, int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable); + +/* remove VLAN header from packet and update csum accordingly. */ +static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +{ + struct vlan_hdr *vhdr; + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + __skb_push(skb, offset); + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + goto pull; + + skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *vlan_tci = ntohs(vhdr->h_vlan_TCI); + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + + skb_reset_mac_len(skb); +pull: + __skb_pull(skb, offset); + + return err; +} + +int skb_vlan_pop(struct sk_buff *skb) +{ + u16 vlan_tci; + __be16 vlan_proto; + int err; + + if (likely(vlan_tx_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __skb_vlan_pop(skb, &vlan_tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + vlan_proto = skb->protocol; + err = __skb_vlan_pop(skb, &vlan_tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_pop); + +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +{ + if (vlan_tx_tag_present(skb)) { + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + /* __vlan_insert_tag expect skb->data pointing to mac header. + * So change skb->data before calling it and change back to + * original position later + */ + __skb_push(skb, offset); + err = __vlan_insert_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (err) + return err; + skb->protocol = skb->vlan_proto; + skb->mac_len += VLAN_HLEN; + __skb_pull(skb, offset); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(skb->data + + (2 * ETH_ALEN), VLAN_HLEN, 0)); + } + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_push); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * @header_len: size of linear part + * @data_len: needed length in frags + * @max_page_order: max page order desired. + * @errcode: pointer to error code if any + * @gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. 
+ */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + gfp_t gfp_head; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... + */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + gfp_head = gfp_mask; + if (gfp_head & __GFP_WAIT) + gfp_head |= __GFP_REPEAT; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_head); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(gfp_mask | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); diff --git a/net/core/sock.c b/net/core/sock.c index 9c3f823e76a9..1c7a33db1314 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. 
- */ - skb_len = skb->len; - /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ @@ -896,6 +888,19 @@ set_rcvbuf: } break; + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -1221,6 +1226,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_max_pacing_rate; break; + case SO_INCOMING_CPU: + v.val = sk->sk_incoming_cpu; + break; + default: return -ENOPROTOOPT; } @@ -1489,9 +1498,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -1528,6 +1534,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -1645,18 +1652,24 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; -#ifdef CONFIG_INET if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_put(inet_twsk(sk)); else -#endif sock_put(sk); } EXPORT_SYMBOL(sock_edemux); +#endif kuid_t sock_i_uid(struct sock *sk) { @@ -1718,16 +1731,34 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); -/* - * Free an option memory block. +/* Free an option memory block. Note, we actually want the inline + * here as this allows gcc to detect the nullify and fold away the + * condition entirely. */ -void sock_kfree_s(struct sock *sk, void *mem, int size) +static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, + const bool nullify) { - kfree(mem); + if (WARN_ON_ONCE(!mem)) + return; + if (nullify) + kzfree(mem); + else + kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } + +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, false); +} EXPORT_SYMBOL(sock_kfree_s); +void sock_kzfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, true); +} +EXPORT_SYMBOL(sock_kzfree_s); + /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. 
*/ @@ -1764,21 +1795,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { - struct sk_buff *skb = NULL; - unsigned long chunk; - gfp_t gfp_mask; + struct sk_buff *skb; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - struct page *page; - int i; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; timeo = sock_sndtimeo(sk, noblock); - while (!skb) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; @@ -1787,66 +1809,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); - continue; - } - - err = -ENOBUFS; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; + if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + break; - skb = alloc_skb(header_len, gfp_mask); - if (!skb) + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) goto failure; - - skb->truesize += data_len; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages(sk->sk_allocation | - __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } - order--; - } - page = alloc_page(sk->sk_allocation); - if (!page) - goto failure; -fill_page: - chunk = min_t(unsigned long, data_len, - PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); - data_len -= chunk; - npages -= 1 << order; - } + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: - kfree_skb(skb); *errcode = err; return NULL; } @@ -2308,9 +2291,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; @@ -2498,11 +2478,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; int copied, err; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -2511,7 +2491,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -2523,16 +2503,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err 
= 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cf9cd13509a7..31baba2a71ce 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int net_msg_warn; /* Unused, but still a sysctl */ + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -215,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif +static int proc_do_rss_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table fake_table; + char buf[NETDEV_RSS_KEY_LEN * 3]; + + snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + return proc_dostring(&fake_table, write, buffer, lenp, ppos); +} + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -263,6 +277,13 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "netdev_rss_key", + .data = &netdev_rss_key, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_do_rss_key, + }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", diff --git a/net/core/timestamping.c b/net/core/timestamping.c index a8770391ea5b..43d3dd62fcc8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk) return; type = classify(skb); @@ -48,50 +47,14 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) + clone = skb_clone_sk(skb); + if (!clone) return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); -void skb_complete_tx_timestamp(struct sk_buff *skb, - struct skb_shared_hwtstamps *hwtstamps) -{ - struct sock *sk = skb->sk; - struct sock_exterr_skb *serr; - int err; - - if (!hwtstamps) { - sock_put(sk); - kfree_skb(skb); - return; - } - - *skb_hwtstamps(skb) = *hwtstamps; - - serr = SKB_EXT_ERR(skb); - memset(serr, 0, sizeof(*serr)); - serr->ee.ee_errno = ENOMSG; - serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - skb->sk = NULL; - - err = sock_queue_err_skb(sk, skb); - - sock_put(sk); - if (err) - kfree_skb(skb); -} -EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); - bool skb_defer_rx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; diff --git a/net/core/tso.c b/net/core/tso.c index 8c3203c585b0..630b30b4fb53 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -1,6 +1,7 @@ #include <linux/export.h> #include <net/ip.h> #include <net/tso.h> +#include <asm/unaligned.h> /* Calculate expected number of TX descriptors */ int tso_count_descs(struct sk_buff *skb) @@ -23,7 +24,7 @@ void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, iph->id = htons(tso->ip_id); iph->tot_len = htons(size + hdr_len - 
mac_hdr_len); tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); - tcph->seq = htonl(tso->tcp_seq); + put_unaligned_be32(tso->tcp_seq, &tcph->seq); tso->ip_id++; if (!is_last) { diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb8198..000000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include <linux/dmaengine.h> -#include <linux/socket.h> -#include <linux/export.h> -#include <net/tcp.h> -#include <net/netdma.h> - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/core/utils.c b/net/core/utils.c index eed34338736c..7b803884c162 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -33,9 +33,6 @@ #include <asm/byteorder.h> #include <asm/uaccess.h> -int net_msg_warn __read_mostly = 1; -EXPORT_SYMBOL(net_msg_warn); - DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); /* * All net warning printk()s should be guarded by this function. @@ -306,16 +303,14 @@ EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, int pseudohdr) { - __be32 diff[] = { ~from, to }; if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial(diff, sizeof(diff), - ~csum_unfold(*sum))); + *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), + to)); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); + skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); } else if (pseudohdr) - *sum = ~csum_fold(csum_partial(diff, sizeof(diff), - csum_unfold(*sum))); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), + to)); } EXPORT_SYMBOL(inet_proto_csum_replace4);
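Editor's note: the inet_proto_csum_replace4() hunk directly above replaces a csum_partial() pass over a two-word scratch array with straight csum_add()/csum_sub() arithmetic. Below is a minimal, self-contained user-space sketch of the same RFC 1624-style incremental update; the helper names (csum32_add, csum32_sub, csum_fold16) and the sample values are invented for illustration and are not kernel APIs.

/* Incremental one's-complement checksum update: recompute a stored
 * 16-bit Internet checksum after one 32-bit field changes from 'from'
 * to 'to', without touching any other packet bytes.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t csum32_add(uint32_t csum, uint32_t addend)
{
	uint32_t res = csum + addend;

	return res + (res < addend);	/* fold the end-around carry back in */
}

static uint32_t csum32_sub(uint32_t csum, uint32_t addend)
{
	return csum32_add(csum, ~addend);	/* subtract == add the complement */
}

static uint16_t csum_fold16(uint32_t csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;		/* final complement, as in csum_fold() */
}

int main(void)
{
	uint32_t from = 0xc0a80001;	/* old field value, e.g. 192.168.0.1 */
	uint32_t to   = 0xc0a80002;	/* new field value, e.g. 192.168.0.2 */
	uint16_t sum  = 0x1c46;		/* pre-existing folded checksum (sample) */

	/* Undo the final fold, swap the field's contribution, refold.
	 * This mirrors csum_fold(csum_add(csum_sub(~csum_unfold(*sum),
	 * from), to)) in the hunk above.
	 */
	uint32_t partial = (uint16_t)~sum;

	partial = csum32_add(csum32_sub(partial, from), to);
	printf("updated checksum: 0x%04x\n", csum_fold16(partial));
	return 0;
}

The end-around carry in csum32_add() is what makes subtraction expressible as adding the one's complement, so the update never needs to re-sum any packet data.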

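Editor's note: several skbuff.c hunks above (__alloc_skb(), kfree_skbmem(), skb_clone() and the skb_init() cache size) replace hand-laid-out pointer arithmetic over two adjacent sk_buffs plus a trailing atomic_t with a named struct sk_buff_fclones and container_of(). A toy, self-contained sketch of that pattern follows; the types are stand-ins, not the real kernel definitions.

#include <stddef.h>
#include <stdio.h>

/* Same computation the kernel macro performs: recover the enclosing
 * structure from a pointer to one of its members.
 */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct sk_buff {		/* toy skb: just enough state for the demo */
	int fclone;
	char payload[32];
};

struct sk_buff_fclones {	/* what skbuff_fclone_cache now allocates */
	struct sk_buff skb1;	/* the original */
	struct sk_buff skb2;	/* the reserved fast clone */
	int fclone_ref;		/* shared refcount (an atomic_t in the kernel) */
};

int main(void)
{
	struct sk_buff_fclones fc = { .fclone_ref = 1 };
	struct sk_buff *orig = &fc.skb1;

	/* Given only 'orig', find the companion clone and the refcount;
	 * no "skb + 1" or "(atomic_t *)(child + 1)" arithmetic needed.
	 */
	struct sk_buff_fclones *fclones =
		container_of(orig, struct sk_buff_fclones, skb1);

	printf("clone at %p, ref %d\n",
	       (void *)&fclones->skb2, fclones->fclone_ref);
	return 0;
}

Naming the layout also lets the slab cache be created with sizeof(struct sk_buff_fclones) instead of the old (2*sizeof(struct sk_buff)) + sizeof(atomic_t) expression, as the skb_init() hunk shows.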