summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/datagram.c301
-rw-r--r--net/core/dev.c788
-rw-r--r--net/core/dev_addr_lists.c4
-rw-r--r--net/core/dev_ioctl.c16
-rw-r--r--net/core/drop_monitor.c2
-rw-r--r--net/core/dst.c24
-rw-r--r--net/core/ethtool.c169
-rw-r--r--net/core/filter.c151
-rw-r--r--net/core/flow_dissector.c135
-rw-r--r--net/core/gen_estimator.c29
-rw-r--r--net/core/gen_stats.c112
-rw-r--r--net/core/iovec.c47
-rw-r--r--net/core/link_watch.c2
-rw-r--r--net/core/neighbour.c279
-rw-r--r--net/core/net-sysfs.c44
-rw-r--r--net/core/net_namespace.c41
-rw-r--r--net/core/netpoll.c11
-rw-r--r--net/core/pktgen.c79
-rw-r--r--net/core/rtnetlink.c259
-rw-r--r--net/core/scm.c3
-rw-r--r--net/core/secure_seq.c6
-rw-r--r--net/core/skbuff.c712
-rw-r--r--net/core/sock.c162
-rw-r--r--net/core/sysctl_net_core.c21
-rw-r--r--net/core/timestamping.c43
-rw-r--r--net/core/tso.c3
-rw-r--r--net/core/user_dma.c131
-rw-r--r--net/core/utils.c15
29 files changed, 2127 insertions, 1463 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 71093d94ad2b..235e6c50708d 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,7 +16,6 @@ obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
-obj-$(CONFIG_NET_DMA) += user_dma.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a81d4c2..df493d68330c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/uio.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -309,16 +310,14 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
EXPORT_SYMBOL(skb_kill_datagram);
/**
- * skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
- * @to: io vector to copy to
+ * @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
- *
- * Note: the iovec is modified during the copy.
*/
-int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
- struct iovec *to, int len)
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -330,8 +329,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_toiovec(to, skb->data + offset, copy))
- goto fault;
+ if (copy_to_iter(skb->data + offset, copy, to) != copy)
+ goto short_copy;
if ((len -= copy) == 0)
return 0;
offset += copy;
@@ -346,18 +345,12 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- struct page *page = skb_frag_page(frag);
-
if (copy > len)
copy = len;
- vaddr = kmap(page);
- err = memcpy_toiovec(to, vaddr + frag->page_offset +
- offset - start, copy);
- kunmap(page);
- if (err)
- goto fault;
+ if (copy_page_to_iter(skb_frag_page(frag),
+ frag->page_offset + offset -
+ start, copy, to) != copy)
+ goto short_copy;
if (!(len -= copy))
return 0;
offset += copy;
@@ -374,9 +367,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (skb_copy_datagram_iovec(frag_iter,
- offset - start,
- to, copy))
+ if (skb_copy_datagram_iter(frag_iter, offset - start,
+ to, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
@@ -387,113 +379,33 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
if (!len)
return 0;
+ /* This is not really a user copy fault, but rather someone
+ * gave us a bogus length on the skb. We should probably
+ * print a warning here as it may indicate a kernel bug.
+ */
+
fault:
return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_iovec);
-/**
- * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- * @skb: buffer to copy
- * @offset: offset in the buffer to start copying from
- * @to: io vector to copy to
- * @to_offset: offset in the io vector to start copying to
- * @len: amount of data to copy from buffer to iovec
- *
- * Returns 0 or -EFAULT.
- * Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
- const struct iovec *to, int to_offset,
- int len)
-{
- int start = skb_headlen(skb);
- int i, copy = start - offset;
- struct sk_buff *frag_iter;
+short_copy:
+ if (iov_iter_count(to))
+ goto fault;
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to_offset += copy;
- }
-
- /* Copy paged appendix. Hmm... why does this look so complicated? */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
- const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- WARN_ON(start > offset + len);
-
- end = start + skb_frag_size(frag);
- if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- struct page *page = skb_frag_page(frag);
-
- if (copy > len)
- copy = len;
- vaddr = kmap(page);
- err = memcpy_toiovecend(to, vaddr + frag->page_offset +
- offset - start, to_offset, copy);
- kunmap(page);
- if (err)
- goto fault;
- if (!(len -= copy))
- return 0;
- offset += copy;
- to_offset += copy;
- }
- start = end;
- }
-
- skb_walk_frags(skb, frag_iter) {
- int end;
-
- WARN_ON(start > offset + len);
-
- end = start + frag_iter->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_copy_datagram_const_iovec(frag_iter,
- offset - start,
- to, to_offset,
- copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to_offset += copy;
- }
- start = end;
- }
- if (!len)
- return 0;
-
-fault:
- return -EFAULT;
+ return 0;
}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
+EXPORT_SYMBOL(skb_copy_datagram_iter);
/**
- * skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying to
- * @from: io vector to copy to
- * @from_offset: offset in the io vector to start copying from
+ * @from: the copy source
* @len: amount of data to copy to buffer from iovec
*
* Returns 0 or -EFAULT.
- * Note: the iovec is not modified during the copy.
*/
-int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
- const struct iovec *from, int from_offset,
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+ struct iov_iter *from,
int len)
{
int start = skb_headlen(skb);
@@ -504,13 +416,11 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
- copy))
+ if (copy_from_iter(skb->data + offset, copy, from) != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- from_offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -522,24 +432,19 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- struct page *page = skb_frag_page(frag);
+ size_t copied;
if (copy > len)
copy = len;
- vaddr = kmap(page);
- err = memcpy_fromiovecend(vaddr + frag->page_offset +
- offset - start,
- from, from_offset, copy);
- kunmap(page);
- if (err)
+ copied = copy_page_from_iter(skb_frag_page(frag),
+ frag->page_offset + offset - start,
+ copy, from);
+ if (copied != copy)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
- from_offset += copy;
}
start = end;
}
@@ -553,16 +458,13 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (skb_copy_datagram_from_iovec(frag_iter,
- offset - start,
- from,
- from_offset,
- copy))
+ if (skb_copy_datagram_from_iter(frag_iter,
+ offset - start,
+ from, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- from_offset += copy;
}
start = end;
}
@@ -572,101 +474,82 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
fault:
return -EFAULT;
}
-EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+EXPORT_SYMBOL(skb_copy_datagram_from_iter);
/**
- * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
* @skb: buffer to copy
- * @from: io vector to copy from
- * @offset: offset in the io vector to start copying from
- * @count: amount of vectors to copy to buffer from
+ * @from: the source to copy from
*
* The function will first copy up to headlen, and then pin the userspace
* pages and build frags through them.
*
* Returns 0, -EFAULT or -EMSGSIZE.
- * Note: the iovec is not modified during the copy
*/
-int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
- int offset, size_t count)
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
- int len = iov_length(from, count) - offset;
+ int len = iov_iter_count(from);
int copy = min_t(int, skb_headlen(skb), len);
- int size;
- int i = 0;
+ int frag = 0;
/* copy up to skb headlen */
- if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- if (len == copy)
- return 0;
-
- offset += copy;
- while (count--) {
- struct page *page[MAX_SKB_FRAGS];
- int num_pages;
- unsigned long base;
+ while (iov_iter_count(from)) {
+ struct page *pages[MAX_SKB_FRAGS];
+ size_t start;
+ ssize_t copied;
unsigned long truesize;
+ int n = 0;
- /* Skip over from offset and copied */
- if (offset >= from->iov_len) {
- offset -= from->iov_len;
- ++from;
- continue;
- }
- len = from->iov_len - offset;
- base = (unsigned long)from->iov_base + offset;
- size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
- if (i + size > MAX_SKB_FRAGS)
+ if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- num_pages = get_user_pages_fast(base, size, 0, &page[i]);
- if (num_pages != size) {
- release_pages(&page[i], num_pages, 0);
+
+ copied = iov_iter_get_pages(from, pages, ~0U,
+ MAX_SKB_FRAGS - frag, &start);
+ if (copied < 0)
return -EFAULT;
- }
- truesize = size * PAGE_SIZE;
- skb->data_len += len;
- skb->len += len;
+
+ iov_iter_advance(from, copied);
+
+ truesize = PAGE_ALIGN(copied + start);
+ skb->data_len += copied;
+ skb->len += copied;
skb->truesize += truesize;
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
- while (len) {
- int off = base & ~PAGE_MASK;
- int size = min_t(int, len, PAGE_SIZE - off);
- skb_fill_page_desc(skb, i, page[i], off, size);
- base += size;
- len -= size;
- i++;
+ while (copied) {
+ int size = min_t(int, copied, PAGE_SIZE - start);
+ skb_fill_page_desc(skb, frag++, pages[n], start, size);
+ start = 0;
+ copied -= size;
+ n++;
}
- offset = 0;
- ++from;
}
return 0;
}
-EXPORT_SYMBOL(zerocopy_sg_from_iovec);
+EXPORT_SYMBOL(zerocopy_sg_from_iter);
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
- u8 __user *to, int len,
+ struct iov_iter *to, int len,
__wsum *csump)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
+ int n;
/* Copy header. */
if (copy > 0) {
- int err = 0;
if (copy > len)
copy = len;
- *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
- *csump, &err);
- if (err)
+ n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+ if (n != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- to += copy;
pos = copy;
}
@@ -678,26 +561,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- __wsum csum2;
- int err = 0;
- u8 *vaddr;
+ __wsum csum2 = 0;
struct page *page = skb_frag_page(frag);
+ u8 *vaddr = kmap(page);
if (copy > len)
copy = len;
- vaddr = kmap(page);
- csum2 = csum_and_copy_to_user(vaddr +
- frag->page_offset +
- offset - start,
- to, copy, 0, &err);
+ n = csum_and_copy_to_iter(vaddr + frag->page_offset +
+ offset - start, copy,
+ &csum2, to);
kunmap(page);
- if (err)
+ if (n != copy)
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if (!(len -= copy))
return 0;
offset += copy;
- to += copy;
pos += copy;
}
start = end;
@@ -722,7 +601,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
if ((len -= copy) == 0)
return 0;
offset += copy;
- to += copy;
pos += copy;
}
start = end;
@@ -775,20 +653,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
EXPORT_SYMBOL(__skb_checksum_complete);
/**
- * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
+ * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
- * @iov: io vector
+ * @msg: destination
*
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
* -EINVAL - checksum failure.
- * -EFAULT - fault during copy. Beware, in this case iovec
- * can be modified!
+ * -EFAULT - fault during copy.
*/
-int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
- int hlen, struct iovec *iov)
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
+ int hlen, struct msghdr *msg)
{
__wsum csum;
int chunk = skb->len - hlen;
@@ -796,28 +673,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
if (!chunk)
return 0;
- /* Skip filled elements.
- * Pretty silly, look at memcpy_toiovec, though 8)
- */
- while (!iov->iov_len)
- iov++;
-
- if (iov->iov_len < chunk) {
+ if (iov_iter_count(&msg->msg_iter) < chunk) {
if (__skb_checksum_complete(skb))
goto csum_error;
- if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
+ if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
goto fault;
} else {
csum = csum_partial(skb->data, hlen, skb->csum);
- if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
+ if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
chunk, &csum))
goto fault;
if (csum_fold(csum))
goto csum_error;
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
netdev_rx_csum_fault(skb->dev);
- iov->iov_len -= chunk;
- iov->iov_base += chunk;
}
return 0;
csum_error:
@@ -825,7 +694,7 @@ csum_error:
fault:
return -EFAULT;
}
-EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
/**
* datagram_poll - generic datagram poll
diff --git a/net/core/dev.c b/net/core/dev.c
index cf8a95f48cff..683d493aa1bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -118,6 +118,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
+#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
@@ -133,6 +134,7 @@
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
+#include <linux/hrtimer.h>
#include "net-sysfs.h"
@@ -897,23 +899,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
- * dev_get_by_flags_rcu - find any device with given flags
+ * __dev_get_by_flags - find any device with given flags
* @net: the applicable net namespace
* @if_flags: IFF_* values
* @mask: bitmask of bits in if_flags to check
*
* Search for any interface with the given flags. Returns NULL if a device
* is not found or a pointer to the device. Must be called inside
- * rcu_read_lock(), and result refcount is unchanged.
+ * rtnl_lock(), and result refcount is unchanged.
*/
-struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
- unsigned short mask)
+struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
+ unsigned short mask)
{
struct net_device *dev, *ret;
+ ASSERT_RTNL();
+
ret = NULL;
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev(net, dev) {
if (((dev->flags ^ if_flags) & mask) == 0) {
ret = dev;
break;
@@ -921,7 +925,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags
}
return ret;
}
-EXPORT_SYMBOL(dev_get_by_flags_rcu);
+EXPORT_SYMBOL(__dev_get_by_flags);
/**
* dev_valid_name - check if name is okay for network device
@@ -1284,7 +1288,6 @@ static int __dev_open(struct net_device *dev)
clear_bit(__LINK_STATE_START, &dev->state);
else {
dev->flags |= IFF_UP;
- net_dmaengine_get();
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1363,7 +1366,6 @@ static int __dev_close_many(struct list_head *head)
ops->ndo_stop(dev);
dev->flags &= ~IFF_UP;
- net_dmaengine_put();
netpoll_poll_enable(dev);
}
@@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- /*
- * If we're trying to disable lro on a vlan device
- * use the underlying physical device instead
- */
- if (is_vlan_dev(dev))
- dev = vlan_dev_real_dev(dev);
-
- /* the same for macvlan devices */
- if (netif_is_macvlan(dev))
- dev = macvlan_dev_real_dev(dev);
+ struct net_device *lower_dev;
+ struct list_head *iter;
dev->wanted_features &= ~NETIF_F_LRO;
netdev_update_features(dev);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
@@ -1697,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
skb_scrub_packet(skb, true);
skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
return 0;
}
@@ -2153,7 +2151,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
unsigned long flags;
local_irq_save(flags);
- sd = &__get_cpu_var(softnet_data);
+ sd = this_cpu_ptr(&softnet_data);
q->next_sched = NULL;
*sd->output_queue_tailp = q;
sd->output_queue_tailp = &q->next_sched;
@@ -2177,6 +2175,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
return (struct dev_kfree_skb_cb *)skb->cb;
}
+void netif_schedule_queue(struct netdev_queue *txq)
+{
+ rcu_read_lock();
+ if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+/**
+ * netif_wake_subqueue - allow sending packets on subqueue
+ * @dev: network device
+ * @queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ */
+void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock();
+ q = rcu_dereference(txq->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_wake_subqueue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock();
+ q = rcu_dereference(dev_queue->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
@@ -2373,16 +2418,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- int err;
-
- err = ptype->callbacks.gso_send_check(skb);
- segs = ERR_PTR(err);
- if (err || skb_gso_ok(skb, features))
- break;
- __skb_push(skb, (skb->data -
- skb_network_header(skb)));
- }
segs = ptype->callbacks.gso_segment(skb, features);
break;
}
@@ -2485,61 +2520,15 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
return 0;
}
-struct dev_gso_cb {
- void (*destructor)(struct sk_buff *skb);
-};
-
-#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
-
-static void dev_gso_skb_destructor(struct sk_buff *skb)
-{
- struct dev_gso_cb *cb;
-
- kfree_skb_list(skb->next);
- skb->next = NULL;
-
- cb = DEV_GSO_CB(skb);
- if (cb->destructor)
- cb->destructor(skb);
-}
-
-/**
- * dev_gso_segment - Perform emulated hardware segmentation on skb.
- * @skb: buffer to segment
- * @features: device features as applicable to this skb
- *
- * This function segments the given skb and stores the list of segments
- * in skb->next.
- */
-static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
-{
- struct sk_buff *segs;
-
- segs = skb_gso_segment(skb, features);
-
- /* Verifying header integrity only. */
- if (!segs)
- return 0;
-
- if (IS_ERR(segs))
- return PTR_ERR(segs);
-
- skb->next = segs;
- DEV_GSO_CB(skb)->destructor = skb->destructor;
- skb->destructor = dev_gso_skb_destructor;
-
- return 0;
-}
-
/* If MPLS offload request, verify we are testing hardware MPLS features
* instead of standard features for the netdev.
*/
-#ifdef CONFIG_NET_MPLS_GSO
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
- if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
return features;
@@ -2574,21 +2563,33 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
+ netdev_features_t features = dev->features;
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
__be16 protocol = skb->protocol;
- netdev_features_t features = skb->dev->features;
- if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
features &= ~NETIF_F_GSO_MASK;
- if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
- protocol = veh->h_vlan_encapsulated_proto;
- } else if (!vlan_tx_tag_present(skb)) {
- return harmonize_features(skb, features);
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (!vlan_tx_tag_present(skb)) {
+ if (unlikely(protocol == htons(ETH_P_8021Q) ||
+ protocol == htons(ETH_P_8021AD))) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+ } else {
+ goto finalize;
+ }
}
features = netdev_intersect_features(features,
- skb->dev->vlan_features |
+ dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
@@ -2601,123 +2602,147 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
+finalize:
+ if (dev->netdev_ops->ndo_features_check)
+ features &= dev->netdev_ops->ndo_features_check(skb, dev,
+ features);
+
return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
-int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
- struct netdev_queue *txq)
+static int xmit_one(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq, bool more)
{
- const struct net_device_ops *ops = dev->netdev_ops;
- int rc = NETDEV_TX_OK;
- unsigned int skb_len;
+ unsigned int len;
+ int rc;
- if (likely(!skb->next)) {
- netdev_features_t features;
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(skb, dev);
- /*
- * If device doesn't need skb->dst, release it right now while
- * its hot in this cpu cache
- */
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
- skb_dst_drop(skb);
+ len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = netdev_start_xmit(skb, dev, txq, more);
+ trace_net_dev_xmit(skb, rc, dev, len);
- features = netif_skb_features(skb);
+ return rc;
+}
+
+struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
+ struct netdev_queue *txq, int *ret)
+{
+ struct sk_buff *skb = first;
+ int rc = NETDEV_TX_OK;
- if (vlan_tx_tag_present(skb) &&
- !vlan_hw_offload_capable(features, skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto,
- vlan_tx_tag_get(skb));
- if (unlikely(!skb))
- goto out;
+ while (skb) {
+ struct sk_buff *next = skb->next;
- skb->vlan_tci = 0;
+ skb->next = NULL;
+ rc = xmit_one(skb, dev, txq, next != NULL);
+ if (unlikely(!dev_xmit_complete(rc))) {
+ skb->next = next;
+ goto out;
}
- /* If encapsulation offload request, verify we are testing
- * hardware encapsulation features instead of standard
- * features for the netdev
- */
- if (skb->encapsulation)
- features &= dev->hw_enc_features;
+ skb = next;
+ if (netif_xmit_stopped(txq) && skb) {
+ rc = NETDEV_TX_BUSY;
+ break;
+ }
+ }
- if (netif_needs_gso(skb, features)) {
- if (unlikely(dev_gso_segment(skb, features)))
- goto out_kfree_skb;
- if (skb->next)
- goto gso;
- } else {
- if (skb_needs_linearize(skb, features) &&
- __skb_linearize(skb))
- goto out_kfree_skb;
+out:
+ *ret = rc;
+ return skb;
+}
- /* If packet is not checksummed and device does not
- * support checksumming for this protocol, complete
- * checksumming here.
- */
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (skb->encapsulation)
- skb_set_inner_transport_header(skb,
- skb_checksum_start_offset(skb));
- else
- skb_set_transport_header(skb,
- skb_checksum_start_offset(skb));
- if (!(features & NETIF_F_ALL_CSUM) &&
- skb_checksum_help(skb))
- goto out_kfree_skb;
- }
- }
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto))
+ skb = __vlan_hwaccel_push_inside(skb);
+ return skb;
+}
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(skb, dev);
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
+{
+ netdev_features_t features;
- skb_len = skb->len;
- trace_net_dev_start_xmit(skb, dev);
- rc = ops->ndo_start_xmit(skb, dev);
- trace_net_dev_xmit(skb, rc, dev, skb_len);
- if (rc == NETDEV_TX_OK)
- txq_trans_update(txq);
- return rc;
- }
+ if (skb->next)
+ return skb;
-gso:
- do {
- struct sk_buff *nskb = skb->next;
+ features = netif_skb_features(skb);
+ skb = validate_xmit_vlan(skb, features);
+ if (unlikely(!skb))
+ goto out_null;
- skb->next = nskb->next;
- nskb->next = NULL;
+ if (netif_needs_gso(dev, skb, features)) {
+ struct sk_buff *segs;
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(nskb, dev);
-
- skb_len = nskb->len;
- trace_net_dev_start_xmit(nskb, dev);
- rc = ops->ndo_start_xmit(nskb, dev);
- trace_net_dev_xmit(nskb, rc, dev, skb_len);
- if (unlikely(rc != NETDEV_TX_OK)) {
- if (rc & ~NETDEV_TX_MASK)
- goto out_kfree_gso_skb;
- nskb->next = skb->next;
- skb->next = nskb;
- return rc;
+ segs = skb_gso_segment(skb, features);
+ if (IS_ERR(segs)) {
+ goto out_kfree_skb;
+ } else if (segs) {
+ consume_skb(skb);
+ skb = segs;
+ }
+ } else {
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* If packet is not checksummed and device does not
+ * support checksumming for this protocol, complete
+ * checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (!(features & NETIF_F_ALL_CSUM) &&
+ skb_checksum_help(skb))
+ goto out_kfree_skb;
}
- txq_trans_update(txq);
- if (unlikely(netif_xmit_stopped(txq) && skb->next))
- return NETDEV_TX_BUSY;
- } while (skb->next);
-
-out_kfree_gso_skb:
- if (likely(skb->next == NULL)) {
- skb->destructor = DEV_GSO_CB(skb)->destructor;
- consume_skb(skb);
- return rc;
}
+
+ return skb;
+
out_kfree_skb:
kfree_skb(skb);
-out:
- return rc;
+out_null:
+ return NULL;
+}
+
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
+{
+ struct sk_buff *next, *head = NULL, *tail;
+
+ for (; skb != NULL; skb = next) {
+ next = skb->next;
+ skb->next = NULL;
+
+ /* in case skb wont be segmented, point to itself */
+ skb->prev = skb;
+
+ skb = validate_xmit_skb(skb, dev);
+ if (!skb)
+ continue;
+
+ if (!head)
+ head = skb;
+ else
+ tail->next = skb;
+ /* If skb was segmented, skb->prev points to
+ * the last segment. If not, it still contains skb.
+ */
+ tail = skb->prev;
+ }
+ return head;
}
-EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
@@ -2780,12 +2805,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
- if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
- skb_dst_force(skb);
qdisc_bstats_update(q, skb);
- if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
@@ -2796,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
- skb_dst_force(skb);
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
@@ -2893,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_update_prio(skb);
+ /* If device/qdisc don't need skb->dst, release it right now while
+ * its hot in this cpu cache.
+ */
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -2925,11 +2955,15 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
goto recursion_alert;
+ skb = validate_xmit_skb(skb, dev);
+ if (!skb)
+ goto drop;
+
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
- rc = dev_hard_start_xmit(skb, dev, txq);
+ skb = dev_hard_start_xmit(skb, dev, txq, &rc);
__this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
@@ -2950,10 +2984,11 @@ recursion_alert:
}
rc = -ENETDOWN;
+drop:
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
- kfree_skb(skb);
+ kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
@@ -3130,8 +3165,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
if (map) {
- tcpu = map->cpus[((u64) hash * map->len) >> 32];
-
+ tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
cpu = tcpu;
goto done;
@@ -3201,7 +3235,7 @@ static void rps_trigger_softirq(void *data)
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- struct softnet_data *mysd = &__get_cpu_var(softnet_data);
+ struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
@@ -3228,7 +3262,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
if (qlen < (netdev_max_backlog >> 1))
return false;
- sd = &__get_cpu_var(softnet_data);
+ sd = this_cpu_ptr(&softnet_data);
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
@@ -3272,7 +3306,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
rps_lock(sd);
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
- if (skb_queue_len(&sd->input_pkt_queue)) {
+ if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
@@ -3375,7 +3409,7 @@ EXPORT_SYMBOL(netif_rx_ni);
static void net_tx_action(struct softirq_action *h)
{
- struct softnet_data *sd = &__get_cpu_var(softnet_data);
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
@@ -3467,7 +3501,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
- q = rxq->qdisc;
+ q = rcu_dereference(rxq->qdisc);
if (q != &noop_qdisc) {
spin_lock(qdisc_lock(q));
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
@@ -3484,7 +3518,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
{
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
- if (!rxq || rxq->qdisc == &noop_qdisc)
+ if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
goto out;
if (*pt_prev) {
@@ -3800,7 +3834,7 @@ EXPORT_SYMBOL(netif_receive_skb);
static void flush_backlog(void *arg)
{
struct net_device *dev = arg;
- struct softnet_data *sd = &__get_cpu_var(softnet_data);
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
struct sk_buff *skb, *tmp;
rps_lock(sd);
@@ -3965,11 +3999,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
- if (skb_is_gso(skb) || skb_has_frag_list(skb))
+ if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
- NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -3983,6 +4016,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->udp_mark = 0;
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ break;
+ default:
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ }
+
pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
}
@@ -4110,6 +4159,10 @@ EXPORT_SYMBOL(napi_gro_receive);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
@@ -4128,7 +4181,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
struct sk_buff *skb = napi->skb;
if (!skb) {
- skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
napi->skb = skb;
}
return skb;
@@ -4212,6 +4265,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_gro_frags);
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
@@ -4240,20 +4318,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
local_irq_enable();
}
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ return sd->rps_ipi_list != NULL;
+#else
+ return false;
+#endif
+}
+
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
-#ifdef CONFIG_RPS
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
- if (sd->rps_ipi_list) {
+ if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
-#endif
+
napi->weight = weight_p;
local_irq_disable();
while (1) {
@@ -4280,7 +4366,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
- list_del(&napi->poll_list);
napi->state = 0;
rps_unlock(sd);
@@ -4300,30 +4385,42 @@ static int process_backlog(struct napi_struct *napi, int quota)
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
- * The entry's receive function will be scheduled to run
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
- ____napi_schedule(&__get_cpu_var(softnet_data), n);
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
+
void __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
- BUG_ON(n->gro_list);
- list_del(&n->poll_list);
+ list_del_init(&n->poll_list);
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
-void napi_complete(struct napi_struct *n)
+void napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags;
@@ -4334,12 +4431,28 @@ void napi_complete(struct napi_struct *n)
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
return;
- napi_gro_flush(n, false);
- local_irq_save(flags);
- __napi_complete(n);
- local_irq_restore(flags);
+ if (n->gro_list) {
+ unsigned long timeout = 0;
+
+ if (work_done)
+ timeout = n->dev->gro_flush_timeout;
+
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ else
+ napi_gro_flush(n, false);
+ }
+ if (likely(list_empty(&n->poll_list))) {
+ WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+ } else {
+ /* If n->poll_list is not empty, we need to mask irqs */
+ local_irq_save(flags);
+ __napi_complete(n);
+ local_irq_restore(flags);
+ }
}
-EXPORT_SYMBOL(napi_complete);
+EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4393,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi)
}
EXPORT_SYMBOL_GPL(napi_hash_del);
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+
+ napi = container_of(timer, struct napi_struct, timer);
+ if (napi->gro_list)
+ napi_schedule(napi);
+
+ return HRTIMER_NORESTART;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
+ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->timer.function = napi_watchdog;
napi->gro_count = 0;
napi->gro_list = NULL;
napi->skb = NULL;
@@ -4415,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
}
EXPORT_SYMBOL(netif_napi_add);
+void napi_disable(struct napi_struct *n)
+{
+ might_sleep();
+ set_bit(NAPI_STATE_DISABLE, &n->state);
+
+ while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+ msleep(1);
+
+ hrtimer_cancel(&n->timer);
+
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
void netif_napi_del(struct napi_struct *napi)
{
list_del_init(&napi->dev_list);
@@ -4426,99 +4566,112 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
-static void net_rx_action(struct softirq_action *h)
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
- struct softnet_data *sd = &__get_cpu_var(softnet_data);
- unsigned long time_limit = jiffies + 2;
- int budget = netdev_budget;
void *have;
+ int work, weight;
- local_irq_disable();
+ list_del_init(&n->poll_list);
- while (!list_empty(&sd->poll_list)) {
- struct napi_struct *n;
- int work, weight;
+ have = netpoll_poll_lock(n);
- /* If softirq window is exhuasted then punt.
- * Allow this to run for 2 jiffies since which will allow
- * an average latency of 1.5/HZ.
- */
- if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
- goto softnet_break;
+ weight = n->weight;
- local_irq_enable();
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
- /* Even though interrupts have been re-enabled, this
- * access is safe because interrupts can only add new
- * entries to the tail of this list, and only ->poll()
- * calls can remove this head entry from the list.
- */
- n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+ WARN_ON_ONCE(work > weight);
- have = netpoll_poll_lock(n);
+ if (likely(work < weight))
+ goto out_unlock;
- weight = n->weight;
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ goto out_unlock;
+ }
- /* This NAPI_STATE_SCHED test is for avoiding a race
- * with netpoll's poll_napi(). Only the entity which
- * obtains the lock and sees NAPI_STATE_SCHED set will
- * actually make the ->poll() call. Therefore we avoid
- * accidentally calling ->poll() when NAPI is not scheduled.
+ if (n->gro_list) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
*/
- work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- work = n->poll(n, weight);
- trace_napi_poll(n);
- }
+ napi_gro_flush(n, HZ >= 1000);
+ }
- WARN_ON_ONCE(work > weight);
+ /* Some drivers may have called napi_schedule
+ * prior to exhausting their budget.
+ */
+ if (unlikely(!list_empty(&n->poll_list))) {
+ pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+ n->dev ? n->dev->name : "backlog");
+ goto out_unlock;
+ }
- budget -= work;
+ list_add_tail(&n->poll_list, repoll);
- local_irq_disable();
+out_unlock:
+ netpoll_poll_unlock(have);
- /* Drivers must not modify the NAPI state if they
- * consume the entire weight. In such cases this code
- * still "owns" the NAPI instance and therefore can
- * move the instance around on the list at-will.
- */
- if (unlikely(work == weight)) {
- if (unlikely(napi_disable_pending(n))) {
- local_irq_enable();
- napi_complete(n);
- local_irq_disable();
- } else {
- if (n->gro_list) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- local_irq_enable();
- napi_gro_flush(n, HZ >= 1000);
- local_irq_disable();
- }
- list_move_tail(&n->poll_list, &sd->poll_list);
- }
+ return work;
+}
+
+static void net_rx_action(struct softirq_action *h)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ unsigned long time_limit = jiffies + 2;
+ int budget = netdev_budget;
+ LIST_HEAD(list);
+ LIST_HEAD(repoll);
+
+ local_irq_disable();
+ list_splice_init(&sd->poll_list, &list);
+ local_irq_enable();
+
+ for (;;) {
+ struct napi_struct *n;
+
+ if (list_empty(&list)) {
+ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
+ return;
+ break;
}
- netpoll_poll_unlock(have);
+ n = list_first_entry(&list, struct napi_struct, poll_list);
+ budget -= napi_poll(n, &repoll);
+
+ /* If softirq window is exhausted then punt.
+ * Allow this to run for 2 jiffies since which will allow
+ * an average latency of 1.5/HZ.
+ */
+ if (unlikely(budget <= 0 ||
+ time_after_eq(jiffies, time_limit))) {
+ sd->time_squeeze++;
+ break;
+ }
}
-out:
- net_rps_action_and_irq_enable(sd);
-#ifdef CONFIG_NET_DMA
- /*
- * There may not be any more sk_buffs coming right now, so push
- * any pending DMA copies to hardware
- */
- dma_issue_pending_all();
-#endif
+ local_irq_disable();
- return;
+ list_splice_tail_init(&sd->poll_list, &list);
+ list_splice_tail(&repoll, &list);
+ list_splice(&list, &sd->poll_list);
+ if (!list_empty(&sd->poll_list))
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
-softnet_break:
- sd->time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- goto out;
+ net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
@@ -5718,7 +5871,7 @@ EXPORT_SYMBOL(dev_change_carrier);
* Get device physical port ID
*/
int dev_get_phys_port_id(struct net_device *dev,
- struct netdev_phys_port_id *ppid)
+ struct netdev_phys_item_id *ppid)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -5797,6 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
/* Shutdown queueing discipline. */
dev_shutdown(dev);
@@ -5806,6 +5961,11 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+ GFP_KERNEL);
+
/*
* Flush the unicast and multicast chains
*/
@@ -5815,9 +5975,8 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
@@ -6589,6 +6748,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gso_min_segs = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -6598,7 +6758,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
- dev->priv_flags = IFF_XMIT_DST_RELEASE;
+ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
dev->num_tx_queues = txqs;
@@ -7020,53 +7180,45 @@ const char *netdev_drivername(const struct net_device *dev)
return empty;
}
-static int __netdev_printk(const char *level, const struct net_device *dev,
- struct va_format *vaf)
+static void __netdev_printk(const char *level, const struct net_device *dev,
+ struct va_format *vaf)
{
- int r;
-
if (dev && dev->dev.parent) {
- r = dev_printk_emit(level[1] - '0',
- dev->dev.parent,
- "%s %s %s%s: %pV",
- dev_driver_string(dev->dev.parent),
- dev_name(dev->dev.parent),
- netdev_name(dev), netdev_reg_state(dev),
- vaf);
+ dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s%s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
} else if (dev) {
- r = printk("%s%s%s: %pV", level, netdev_name(dev),
- netdev_reg_state(dev), vaf);
+ printk("%s%s%s: %pV",
+ level, netdev_name(dev), netdev_reg_state(dev), vaf);
} else {
- r = printk("%s(NULL net_device): %pV", level, vaf);
+ printk("%s(NULL net_device): %pV", level, vaf);
}
-
- return r;
}
-int netdev_printk(const char *level, const struct net_device *dev,
- const char *format, ...)
+void netdev_printk(const char *level, const struct net_device *dev,
+ const char *format, ...)
{
struct va_format vaf;
va_list args;
- int r;
va_start(args, format);
vaf.fmt = format;
vaf.va = &args;
- r = __netdev_printk(level, dev, &vaf);
+ __netdev_printk(level, dev, &vaf);
va_end(args);
-
- return r;
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level) \
-int func(const struct net_device *dev, const char *fmt, ...) \
+void func(const struct net_device *dev, const char *fmt, ...) \
{ \
- int r; \
struct va_format vaf; \
va_list args; \
\
@@ -7075,11 +7227,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \
vaf.fmt = fmt; \
vaf.va = &args; \
\
- r = __netdev_printk(level, dev, &vaf); \
+ __netdev_printk(level, dev, &vaf); \
\
va_end(args); \
- \
- return r; \
} \
EXPORT_SYMBOL(func);
@@ -7141,11 +7291,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
*/
struct net *net;
bool unregistering;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
- prepare_to_wait(&netdev_unregistering_wq, &wait,
- TASK_UNINTERRUPTIBLE);
unregistering = false;
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
@@ -7157,9 +7306,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
if (!unregistering)
break;
__rtnl_unlock();
- schedule();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
- finish_wait(&netdev_unregistering_wq, &wait);
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index b6b230600b97..c0548d268e1a 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -278,8 +278,8 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
EXPORT_SYMBOL(__hw_addr_sync_dev);
/**
- * __hw_addr_unsync_dev - Remove synchonized addresses from device
- * @list: address list to remove syncronized addresses from
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
* @dev: device to sync
* @unsync: function to call if address should be removed
*
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index cf999e09bcd2..b94b1d293506 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -142,10 +142,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
case SIOCGIFHWADDR:
if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
+ memset(ifr->ifr_hwaddr.sa_data, 0,
+ sizeof(ifr->ifr_hwaddr.sa_data));
else
memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
ifr->ifr_hwaddr.sa_family = dev->type;
return 0;
@@ -265,7 +267,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
if (ifr->ifr_hwaddr.sa_family != dev->type)
return -EINVAL;
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return 0;
@@ -365,11 +368,8 @@ void dev_load(struct net *net, const char *name)
no_module = !dev;
if (no_module && capable(CAP_NET_ADMIN))
no_module = request_module("netdev-%s", name);
- if (no_module && capable(CAP_SYS_MODULE)) {
- if (!request_module("%s", name))
- pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
- name);
- }
+ if (no_module && capable(CAP_SYS_MODULE))
+ request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 50f9a9db5792..252e155c837b 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -146,7 +146,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
unsigned long flags;
local_irq_save(flags);
- data = &__get_cpu_var(dm_cpu_data);
+ data = this_cpu_ptr(&dm_cpu_data);
spin_lock(&data->lock);
dskb = data->skb;
diff --git a/net/core/dst.c b/net/core/dst.c
index a028409ee438..e956ce6d1378 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
-/**
- * __skb_dst_set_noref - sets skb dst, without a reference
- * @skb: buffer
- * @dst: dst entry
- * @force: if force is set, use noref version even for DST_NOCACHE entries
- *
- * Sets skb dst, assuming a reference was not taken on dst
- * skb_dst_drop() should not dst_release() this dst
- */
-void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
-{
- WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
- /* If dst not in cache, we must take a reference, because
- * dst_release() will destroy dst as soon as its refcount becomes zero
- */
- if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
- dst_hold(dst);
- skb_dst_set(skb, dst);
- } else {
- skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
- }
-}
-EXPORT_SYMBOL(__skb_dst_set_noref);
-
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 17cb912793fa..550892cd6b3f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/sched.h>
+#include <linux/net.h>
/*
* Some useful ethtool_ops methods that're device independent.
@@ -84,7 +85,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
- [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
@@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
};
+static const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_FEATURES)
return ARRAY_SIZE(netdev_features_strings);
+ if (sset == ETH_SS_RSS_HASH_FUNCS)
+ return ARRAY_SIZE(rss_hash_func_strings);
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev,
if (stringset == ETH_SS_FEATURES)
memcpy(data, netdev_features_strings,
sizeof(netdev_features_strings));
+ else if (stringset == ETH_SS_RSS_HASH_FUNCS)
+ memcpy(data, rss_hash_func_strings,
+ sizeof(rss_hash_func_strings));
else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -574,6 +586,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
return 0;
}
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+
+void netdev_rss_key_fill(void *buffer, size_t len)
+{
+ BUG_ON(len > sizeof(netdev_rss_key));
+ net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
+ memcpy(buffer, netdev_rss_key, len);
+}
+EXPORT_SYMBOL(netdev_rss_key_fill);
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -608,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
if (!indir)
return -ENOMEM;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
if (ret)
goto out;
@@ -669,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
goto out;
}
- ret = ops->set_rxfh(dev, indir, NULL);
+ ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
out:
kfree(indir);
@@ -687,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
u32 total_size;
u32 indir_bytes;
u32 *indir = NULL;
+ u8 dev_hfunc = 0;
u8 *hkey = NULL;
u8 *rss_config;
- if (!(dev->ethtool_ops->get_rxfh_indir_size ||
- dev->ethtool_ops->get_rxfh_key_size) ||
- !dev->ethtool_ops->get_rxfh)
+ if (!ops->get_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
@@ -700,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (ops->get_rxfh_key_size)
dev_key_size = ops->get_rxfh_key_size(dev);
- if ((dev_key_size + dev_indir_size) == 0)
- return -EOPNOTSUPP;
-
if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
user_indir_size = rxfh.indir_size;
user_key_size = rxfh.key_size;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+ rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
rxfh.indir_size = dev_indir_size;
@@ -717,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
return -EFAULT;
- /* If the user buffer size is 0, this is just a query for the
- * device table size and key size. Otherwise, if the User size is
- * not equal to device table size or key size it's an error.
- */
- if (!user_indir_size && !user_key_size)
- return 0;
-
if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
(user_key_size && (user_key_size != dev_key_size)))
return -EINVAL;
@@ -740,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (user_key_size)
hkey = rss_config + indir_bytes;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
- if (!ret) {
- if (copy_to_user(useraddr +
- offsetof(struct ethtool_rxfh, rss_config[0]),
- rss_config, total_size))
- ret = -EFAULT;
- }
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+ if (ret)
+ goto out;
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
+ &dev_hfunc, sizeof(rxfh.hfunc))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size)) {
+ ret = -EFAULT;
+ }
+out:
kfree(rss_config);
return ret;
@@ -766,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
u8 *rss_config;
u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
- if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
- !ops->get_rxnfc || !ops->set_rxfh)
+ if (!ops->get_rxnfc || !ops->set_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
dev_indir_size = ops->get_rxfh_indir_size(dev);
if (ops->get_rxfh_key_size)
dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
- if ((dev_key_size + dev_indir_size) == 0)
- return -EOPNOTSUPP;
if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+ rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
- /* If either indir or hash key is valid, proceed further.
- * It is not valid to request that both be unchanged.
+ /* If either indir, hash key or function is valid, proceed further.
+ * Must request at least one change: indir size, hash key or function.
*/
if ((rxfh.indir_size &&
rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
rxfh.indir_size != dev_indir_size) ||
(rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
(rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
- rxfh.key_size == 0))
+ rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
return -EINVAL;
if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
@@ -835,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
}
- ret = ops->set_rxfh(dev, indir, hkey);
+ ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
out:
kfree(rss_config);
@@ -1036,7 +1051,8 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
- if (!ops->get_eeprom || !ops->get_eeprom_len)
+ if (!ops->get_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
return -EOPNOTSUPP;
return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom,
@@ -1052,7 +1068,8 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
u8 *data;
int ret = 0;
- if (!ops->set_eeprom || !ops->get_eeprom_len)
+ if (!ops->set_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
return -EOPNOTSUPP;
if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
@@ -1621,6 +1638,81 @@ static int ethtool_get_module_eeprom(struct net_device *dev,
modinfo.eeprom_len);
}
+static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK:
+ if (tuna->len != sizeof(u32) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U32)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->get_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_tunable(dev, &tuna, data);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->set_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ ret = ops->set_tunable(dev, &tuna, data);
+
+out:
+ kfree(data);
+ return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -1670,6 +1762,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GCHANNELS:
case ETHTOOL_GET_TS_INFO:
case ETHTOOL_GEEE:
+ case ETHTOOL_GTUNABLE:
break;
default:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1857,6 +1950,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GMODULEEEPROM:
rc = ethtool_get_module_eeprom(dev, useraddr);
break;
+ case ETHTOOL_GTUNABLE:
+ rc = ethtool_get_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_STUNABLE:
+ rc = ethtool_set_tunable(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index d814b8a89d0f..ec9baea10c16 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -44,6 +44,7 @@
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
+#include <linux/bpf.h>
/**
* sk_filter - run a packet through a socket filter
@@ -51,9 +52,9 @@
* @skb: buffer to filter
*
* Run the filter code and then cut skb->data to correct size returned by
- * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller
* than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to sk_run_filter. It returns 0 if the packet should
+ * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
* be accepted or -EPERM if the packet should be tossed.
*
*/
@@ -87,33 +88,9 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sk_filter);
-/* Helper to find the offset of pkt_type in sk_buff structure. We want
- * to make sure its still a 3bit field starting at a byte boundary;
- * taken from arch/x86/net/bpf_jit_comp.c.
- */
-#ifdef __BIG_ENDIAN_BITFIELD
-#define PKT_TYPE_MAX (7 << 5)
-#else
-#define PKT_TYPE_MAX 7
-#endif
-static unsigned int pkt_type_offset(void)
-{
- struct sk_buff skb_probe = { .pkt_type = ~0, };
- u8 *ct = (u8 *) &skb_probe;
- unsigned int off;
-
- for (off = 0; off < sizeof(struct sk_buff); off++) {
- if (ct[off] == PKT_TYPE_MAX)
- return off;
- }
-
- pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
- return -1;
-}
-
static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
- return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+ return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
}
static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
@@ -190,11 +167,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_PKTTYPE:
- *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
- pkt_type_offset());
- if (insn->off < 0)
- return false;
- insn++;
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
+ PKT_TYPE_OFFSET());
*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
insn++;
@@ -593,11 +567,8 @@ err:
/* Security:
*
- * A BPF program is able to use 16 cells of memory to store intermediate
- * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
- *
* As we dont want to clear mem[] array for each packet going through
- * sk_run_filter(), we check that filter loaded by user never try to read
+ * __bpf_prog_run(), we check that filter loaded by user never try to read
* a cell if not previously written, and we check all branches to be sure
* a malicious user doesn't try to abuse us.
*/
@@ -843,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
static void __bpf_prog_release(struct bpf_prog *prog)
{
- bpf_release_orig_filter(prog);
- bpf_prog_free(prog);
+ if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ } else {
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+ }
}
static void __sk_filter_release(struct sk_filter *fp)
@@ -933,7 +908,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
/* Expand fp for appending the new filter representation. */
old_fp = fp;
- fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL);
+ fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
if (!fp) {
/* The old_fp is still around in case we couldn't
* allocate new memory, so uncharge on that one.
@@ -972,7 +947,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
int err;
fp->bpf_func = NULL;
- fp->jited = 0;
+ fp->jited = false;
err = bpf_check_classic(fp->insns, fp->len);
if (err) {
@@ -1013,7 +988,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
if (fprog->filter == NULL)
return -EINVAL;
- fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL);
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
if (!fp)
return -ENOMEM;
@@ -1069,12 +1044,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (fprog->filter == NULL)
return -EINVAL;
- prog = kmalloc(bpf_fsize, GFP_KERNEL);
+ prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
return -ENOMEM;
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
- kfree(prog);
+ __bpf_prog_free(prog);
return -EFAULT;
}
@@ -1082,7 +1057,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
- kfree(prog);
+ __bpf_prog_free(prog);
return -ENOMEM;
}
@@ -1118,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
+#ifdef CONFIG_BPF_SYSCALL
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+ struct bpf_prog *prog;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get(ufd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ /* valid fd, but invalid program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp) {
+ bpf_prog_put(prog);
+ return -ENOMEM;
+ }
+ fp->prog = prog;
+
+ atomic_set(&fp->refcnt, 0);
+
+ if (!sk_filter_charge(sk, fp)) {
+ __sk_filter_release(fp);
+ return -ENOMEM;
+ }
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+
+/* allow socket filters to call
+ * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
+ */
+static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
+ return NULL;
+ }
+}
+
+static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* skb fields cannot be accessed yet */
+ return false;
+}
+
+static struct bpf_verifier_ops sock_filter_ops = {
+ .get_func_proto = sock_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+ .ops = &sock_filter_ops,
+ .type = BPF_PROG_TYPE_SOCKET_FILTER,
+};
+
+static int __init register_sock_filter_ops(void)
+{
+ bpf_register_prog_type(&tl);
+ return 0;
+}
+late_initcall(register_sock_filter_ops);
+#else
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ return -EOPNOTSUPP;
+}
+#endif
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5f362c1d0332..45084938c403 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -13,6 +13,7 @@
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <net/flow_keys.h>
+#include <scsi/fc/fc_fcoe.h>
/* copy saddr & daddr, possibly using 64bit load/store
* Equivalent to : flow->src = iph->saddr;
@@ -26,36 +27,61 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i
}
/**
- * skb_flow_get_ports - extract the upper layer ports and return them
- * @skb: buffer to extract the ports from
+ * __skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: sk_buff to extract the ports from
* @thoff: transport header offset
* @ip_proto: protocol for which to get port offset
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
*
* The function will try to retrieve the ports at offset thoff + poff where poff
* is the protocol port offset returned from proto_ports_offset
*/
-__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
+ if (!data) {
+ data = skb->data;
+ hlen = skb_headlen(skb);
+ }
+
if (poff >= 0) {
__be32 *ports, _ports;
- ports = skb_header_pointer(skb, thoff + poff,
- sizeof(_ports), &_ports);
+ ports = __skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), data, hlen, &_ports);
if (ports)
return *ports;
}
return 0;
}
-EXPORT_SYMBOL(skb_flow_get_ports);
+EXPORT_SYMBOL(__skb_flow_get_ports);
-bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
+/**
+ * __skb_flow_dissect - extract the flow_keys struct and return it
+ * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
+ * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ *
+ * The function will try to retrieve the struct flow_keys from either the skbuff
+ * or a raw buffer specified by the rest parameters
+ */
+bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+ void *data, __be16 proto, int nhoff, int hlen)
{
- int nhoff = skb_network_offset(skb);
u8 ip_proto;
- __be16 proto = skb->protocol;
+
+ if (!data) {
+ data = skb->data;
+ proto = skb->protocol;
+ nhoff = skb_network_offset(skb);
+ hlen = skb_headlen(skb);
+ }
memset(flow, 0, sizeof(*flow));
@@ -65,7 +91,7 @@ again:
const struct iphdr *iph;
struct iphdr _iph;
ip:
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph || iph->ihl < 5)
return false;
nhoff += iph->ihl * 4;
@@ -74,6 +100,13 @@ ip:
if (ip_is_fragment(iph))
ip_proto = 0;
+ /* skip the address processing if skb is NULL. The assumption
+ * here is that if there is no skb we are not looking for flow
+ * info but lengths and protocols.
+ */
+ if (!skb)
+ break;
+
iph_to_flow_copy_addrs(flow, iph);
break;
}
@@ -83,14 +116,19 @@ ip:
__be32 flow_label;
ipv6:
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph)
return false;
ip_proto = iph->nexthdr;
+ nhoff += sizeof(struct ipv6hdr);
+
+ /* see comment above in IPv4 section */
+ if (!skb)
+ break;
+
flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
- nhoff += sizeof(struct ipv6hdr);
flow_label = ip6_flowlabel(iph);
if (flow_label) {
@@ -113,7 +151,7 @@ ipv6:
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
- vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
if (!vlan)
return false;
@@ -126,7 +164,7 @@ ipv6:
struct pppoe_hdr hdr;
__be16 proto;
} *hdr, _hdr;
- hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
proto = hdr->proto;
@@ -140,6 +178,9 @@ ipv6:
return false;
}
}
+ case htons(ETH_P_FCOE):
+ flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+ /* fall through */
default:
return false;
}
@@ -151,7 +192,7 @@ ipv6:
__be16 proto;
} *hdr, _hdr;
- hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
/*
@@ -171,8 +212,9 @@ ipv6:
const struct ethhdr *eth;
struct ethhdr _eth;
- eth = skb_header_pointer(skb, nhoff,
- sizeof(_eth), &_eth);
+ eth = __skb_header_pointer(skb, nhoff,
+ sizeof(_eth),
+ data, hlen, &_eth);
if (!eth)
return false;
proto = eth->h_proto;
@@ -194,12 +236,16 @@ ipv6:
flow->n_proto = proto;
flow->ip_proto = ip_proto;
- flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
flow->thoff = (u16) nhoff;
+ /* unless skb is set we don't need to record port info */
+ if (skb)
+ flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+ data, hlen);
+
return true;
}
-EXPORT_SYMBOL(skb_flow_dissect);
+EXPORT_SYMBOL(__skb_flow_dissect);
static u32 hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
@@ -286,34 +332,27 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
qcount = dev->tc_to_txq[tc].count;
}
- return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset;
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
-/* __skb_get_poff() returns the offset to the payload as far as it could
- * be dissected. The main user is currently BPF, so that we can dynamically
- * truncate packets without needing to push actual payload to the user
- * space and can analyze headers only, instead.
- */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+ const struct flow_keys *keys, int hlen)
{
- struct flow_keys keys;
- u32 poff = 0;
+ u32 poff = keys->thoff;
- if (!skb_flow_dissect(skb, &keys))
- return 0;
-
- poff += keys.thoff;
- switch (keys.ip_proto) {
+ switch (keys->ip_proto) {
case IPPROTO_TCP: {
- const struct tcphdr *tcph;
- struct tcphdr _tcph;
+ /* access doff as u8 to avoid unaligned access */
+ const u8 *doff;
+ u8 _doff;
- tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph);
- if (!tcph)
+ doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
+ data, hlen, &_doff);
+ if (!doff)
return poff;
- poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4);
+ poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
break;
}
case IPPROTO_UDP:
@@ -343,6 +382,21 @@ u32 __skb_get_poff(const struct sk_buff *skb)
return poff;
}
+/* skb_get_poff() returns the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push actual payload to the user
+ * space and can analyze headers only, instead.
+ */
+u32 skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return 0;
+
+ return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
+}
+
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
@@ -359,9 +413,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
if (map->len == 1)
queue_index = map->queues[0];
else
- queue_index = map->queues[
- ((u64)skb_get_hash(skb) * map->len) >> 32];
-
+ queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+ map->len)];
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9d33dfffca19..9dfb88a933e7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -91,6 +91,8 @@ struct gen_estimator
u32 avpps;
struct rcu_head e_rcu;
struct rb_node node;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ struct rcu_head head;
};
struct gen_estimator_head
@@ -115,9 +117,8 @@ static void est_timer(unsigned long arg)
rcu_read_lock();
list_for_each_entry_rcu(e, &elist[idx].list, list) {
- u64 nbytes;
+ struct gnet_stats_basic_packed b = {0};
u64 brate;
- u32 npackets;
u32 rate;
spin_lock(e->stats_lock);
@@ -125,15 +126,15 @@ static void est_timer(unsigned long arg)
if (e->bstats == NULL)
goto skip;
- nbytes = e->bstats->bytes;
- npackets = e->bstats->packets;
- brate = (nbytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = nbytes;
+ __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+
+ brate = (b.bytes - e->last_bytes)<<(7 - idx);
+ e->last_bytes = b.bytes;
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
- rate = (npackets - e->last_packets)<<(12 - idx);
- e->last_packets = npackets;
+ rate = (b.packets - e->last_packets)<<(12 - idx);
+ e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
e->rate_est->pps = (e->avpps+0x1FF)>>10;
skip:
@@ -203,12 +204,14 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock,
struct nlattr *opt)
{
struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
+ struct gnet_stats_basic_packed b = {0};
int idx;
if (nla_len(opt) < sizeof(*parm))
@@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (est == NULL)
return -ENOBUFS;
+ __gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+
idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->ewma_log = parm->ewma_log;
- est->last_bytes = bstats->bytes;
+ est->last_bytes = b.bytes;
est->avbps = rate_est->bps<<5;
- est->last_packets = bstats->packets;
+ est->last_packets = b.packets;
est->avpps = rate_est->pps<<10;
+ est->cpu_bstats = cpu_bstats;
spin_lock_bh(&est_tree_lock);
if (!elist[idx].timer.function) {
@@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator);
* Returns 0 on success or a negative error code.
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt)
{
gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 2ddbce4cce14..0c08062d1796 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
}
EXPORT_SYMBOL(gnet_stats_start_copy);
+static void
+__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes;
+ u32 packets;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ bytes = bcpu->bstats.bytes;
+ packets = bcpu->bstats.packets;
+ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+
+ bstats->bytes += bytes;
+ bstats->packets += packets;
+ }
+}
+
+void
+__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
+{
+ if (cpu) {
+ __gnet_stats_copy_basic_cpu(bstats, cpu);
+ } else {
+ bstats->bytes = b->bytes;
+ bstats->packets = b->packets;
+ }
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic);
+
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
@@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
+gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
{
+ struct gnet_stats_basic_packed bstats = {0};
+
+ __gnet_stats_copy_basic(&bstats, cpu, b);
+
if (d->compat_tc_stats) {
- d->tc_stats.bytes = b->bytes;
- d->tc_stats.packets = b->packets;
+ d->tc_stats.bytes = bstats.bytes;
+ d->tc_stats.packets = bstats.packets;
}
if (d->tail) {
struct gnet_stats_basic sb;
memset(&sb, 0, sizeof(sb));
- sb.bytes = b->bytes;
- sb.packets = b->packets;
+ sb.bytes = bstats.bytes;
+ sb.packets = bstats.packets;
return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
}
return 0;
@@ -172,29 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+static void
+__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *q)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
+
+ qstats->qlen = 0;
+ qstats->backlog += qcpu->backlog;
+ qstats->drops += qcpu->drops;
+ qstats->requeues += qcpu->requeues;
+ qstats->overlimits += qcpu->overlimits;
+ }
+}
+
+static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q,
+ __u32 qlen)
+{
+ if (cpu) {
+ __gnet_stats_copy_queue_cpu(qstats, cpu);
+ } else {
+ qstats->qlen = q->qlen;
+ qstats->backlog = q->backlog;
+ qstats->drops = q->drops;
+ qstats->requeues = q->requeues;
+ qstats->overlimits = q->overlimits;
+ }
+
+ qstats->qlen = qlen;
+}
+
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
* @d: dumping handle
+ * @cpu_q: per cpu queue statistics
* @q: queue statistics
+ * @qlen: queue length statistics
*
* Appends the queue statistics to the top level TLV created by
- * gnet_stats_start_copy().
+ * gnet_stats_start_copy(). Using per cpu queue statistics if
+ * they are available.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
+gnet_stats_copy_queue(struct gnet_dump *d,
+ struct gnet_stats_queue __percpu *cpu_q,
+ struct gnet_stats_queue *q, __u32 qlen)
{
+ struct gnet_stats_queue qstats = {0};
+
+ __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
+
if (d->compat_tc_stats) {
- d->tc_stats.drops = q->drops;
- d->tc_stats.qlen = q->qlen;
- d->tc_stats.backlog = q->backlog;
- d->tc_stats.overlimits = q->overlimits;
+ d->tc_stats.drops = qstats.drops;
+ d->tc_stats.qlen = qstats.qlen;
+ d->tc_stats.backlog = qstats.backlog;
+ d->tc_stats.overlimits = qstats.overlimits;
}
if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q));
+ return gnet_stats_copy(d, TCA_STATS_QUEUE,
+ &qstats, sizeof(qstats));
return 0;
}
diff --git a/net/core/iovec.c b/net/core/iovec.c
index e1ec45ab1e63..dcbe98b3726a 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -28,53 +28,6 @@
#include <net/sock.h>
/*
- * Verify iovec. The caller must ensure that the iovec is big enough
- * to hold the message iovec.
- *
- * Save time not doing access_ok. copy_*_user will make this work
- * in any case.
- */
-
-int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
-{
- int size, ct, err;
-
- if (m->msg_name && m->msg_namelen) {
- if (mode == VERIFY_READ) {
- void __user *namep;
- namep = (void __user __force *) m->msg_name;
- err = move_addr_to_kernel(namep, m->msg_namelen,
- address);
- if (err < 0)
- return err;
- }
- m->msg_name = address;
- } else {
- m->msg_name = NULL;
- m->msg_namelen = 0;
- }
-
- size = m->msg_iovlen * sizeof(struct iovec);
- if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
- return -EFAULT;
-
- m->msg_iov = iov;
- err = 0;
-
- for (ct = 0; ct < m->msg_iovlen; ct++) {
- size_t len = iov[ct].iov_len;
-
- if (len > INT_MAX - err) {
- len = INT_MAX - err;
- iov[ct].iov_len = len;
- }
- err += len;
- }
-
- return err;
-}
-
-/*
* And now for the all-in-one: copy and checksum from a user iovec
* directly to a datagram
* Calls to csum_partial but the last must be in 32 bit chunks
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index bd0767e6b2b3..49a9e3e06c08 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -21,7 +21,7 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
-#include <asm/types.h>
+#include <linux/types.h>
enum lw_bits {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ef31fef25e5a..8e38f17288d3 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags);
static void neigh_update_notify(struct neighbour *neigh);
static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
-static struct neigh_table *neigh_tables;
#ifdef CONFIG_PROC_FS
static const struct file_operations neigh_stat_seq_fops;
#endif
@@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops;
the most complicated procedure, which we allow is dev->hard_header.
It is supposed, that dev->hard_header is simplistic and does
not make callbacks to neighbour tables.
-
- The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
- list of neighbour tables. This list is used only in process context,
*/
-static DEFINE_RWLOCK(neigh_tbl_lock);
-
static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -773,7 +767,7 @@ static void neigh_periodic_work(struct work_struct *work)
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
tbl->last_rand = jiffies;
- for (p = &tbl->parms; p; p = p->next)
+ list_for_each_entry(p, &tbl->parms_list, list)
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
@@ -1446,7 +1440,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
{
struct neigh_parms *p;
- for (p = &tbl->parms; p; p = p->next) {
+ list_for_each_entry(p, &tbl->parms_list, list) {
if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
(!p->dev && !ifindex && net_eq(net, &init_net)))
return p;
@@ -1481,8 +1475,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
}
write_lock_bh(&tbl->lock);
- p->next = tbl->parms.next;
- tbl->parms.next = p;
+ list_add(&p->list, &tbl->parms.list);
write_unlock_bh(&tbl->lock);
neigh_parms_data_state_cleanall(p);
@@ -1501,24 +1494,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head)
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
- struct neigh_parms **p;
-
if (!parms || parms == &tbl->parms)
return;
write_lock_bh(&tbl->lock);
- for (p = &tbl->parms.next; *p; p = &(*p)->next) {
- if (*p == parms) {
- *p = parms->next;
- parms->dead = 1;
- write_unlock_bh(&tbl->lock);
- if (parms->dev)
- dev_put(parms->dev);
- call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
- return;
- }
- }
+ list_del(&parms->list);
+ parms->dead = 1;
write_unlock_bh(&tbl->lock);
- neigh_dbg(1, "%s: not found\n", __func__);
+ if (parms->dev)
+ dev_put(parms->dev);
+ call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
@@ -1530,11 +1514,15 @@ static void neigh_parms_destroy(struct neigh_parms *parms)
static struct lock_class_key neigh_table_proxy_queue_class;
-static void neigh_table_init_no_netlink(struct neigh_table *tbl)
+static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+
+void neigh_table_init(int index, struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
+ INIT_LIST_HEAD(&tbl->parms_list);
+ list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
atomic_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
@@ -1574,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
-}
-
-void neigh_table_init(struct neigh_table *tbl)
-{
- struct neigh_table *tmp;
-
- neigh_table_init_no_netlink(tbl);
- write_lock(&neigh_tbl_lock);
- for (tmp = neigh_tables; tmp; tmp = tmp->next) {
- if (tmp->family == tbl->family)
- break;
- }
- tbl->next = neigh_tables;
- neigh_tables = tbl;
- write_unlock(&neigh_tbl_lock);
- if (unlikely(tmp)) {
- pr_err("Registering multiple tables for family %d\n",
- tbl->family);
- dump_stack();
- }
+ neigh_tables[index] = tbl;
}
EXPORT_SYMBOL(neigh_table_init);
-int neigh_table_clear(struct neigh_table *tbl)
+int neigh_table_clear(int index, struct neigh_table *tbl)
{
- struct neigh_table **tp;
-
+ neigh_tables[index] = NULL;
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
@@ -1609,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl)
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
- write_lock(&neigh_tbl_lock);
- for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
- if (*tp == tbl) {
- *tp = tbl->next;
- break;
- }
- }
- write_unlock(&neigh_tbl_lock);
call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
neigh_hash_free_rcu);
@@ -1634,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl)
}
EXPORT_SYMBOL(neigh_table_clear);
+static struct neigh_table *neigh_find_table(int family)
+{
+ struct neigh_table *tbl = NULL;
+
+ switch (family) {
+ case AF_INET:
+ tbl = neigh_tables[NEIGH_ARP_TABLE];
+ break;
+ case AF_INET6:
+ tbl = neigh_tables[NEIGH_ND_TABLE];
+ break;
+ case AF_DECnet:
+ tbl = neigh_tables[NEIGH_DN_TABLE];
+ break;
+ }
+
+ return tbl;
+}
+
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *dst_attr;
struct neigh_table *tbl;
+ struct neighbour *neigh;
struct net_device *dev = NULL;
int err = -EINVAL;
@@ -1660,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
}
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- struct neighbour *neigh;
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
- if (tbl->family != ndm->ndm_family)
- continue;
- read_unlock(&neigh_tbl_lock);
-
- if (nla_len(dst_attr) < tbl->key_len)
- goto out;
-
- if (ndm->ndm_flags & NTF_PROXY) {
- err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
- goto out;
- }
+ if (nla_len(dst_attr) < tbl->key_len)
+ goto out;
- if (dev == NULL)
- goto out;
+ if (ndm->ndm_flags & NTF_PROXY) {
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out;
+ }
- neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
- if (neigh == NULL) {
- err = -ENOENT;
- goto out;
- }
+ if (dev == NULL)
+ goto out;
- err = neigh_update(neigh, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
goto out;
}
- read_unlock(&neigh_tbl_lock);
- err = -EAFNOSUPPORT;
+
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
out:
return err;
@@ -1700,11 +1672,14 @@ out:
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct neigh_table *tbl;
struct net_device *dev = NULL;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
int err;
ASSERT_RTNL();
@@ -1728,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
goto out;
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
- struct neighbour *neigh;
- void *dst, *lladdr;
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
- if (tbl->family != ndm->ndm_family)
- continue;
- read_unlock(&neigh_tbl_lock);
+ if (nla_len(tb[NDA_DST]) < tbl->key_len)
+ goto out;
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
- if (nla_len(tb[NDA_DST]) < tbl->key_len)
- goto out;
- dst = nla_data(tb[NDA_DST]);
- lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
- if (ndm->ndm_flags & NTF_PROXY) {
- struct pneigh_entry *pn;
+ err = -ENOBUFS;
+ pn = pneigh_lookup(tbl, net, dst, dev, 1);
+ if (pn) {
+ pn->flags = ndm->ndm_flags;
+ err = 0;
+ }
+ goto out;
+ }
- err = -ENOBUFS;
- pn = pneigh_lookup(tbl, net, dst, dev, 1);
- if (pn) {
- pn->flags = ndm->ndm_flags;
- err = 0;
- }
+ if (dev == NULL)
+ goto out;
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
goto out;
}
- if (dev == NULL)
+ neigh = __neigh_lookup_errno(tbl, dst, dev);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out;
+ }
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
goto out;
-
- neigh = neigh_lookup(tbl, dst, dev);
- if (neigh == NULL) {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- err = -ENOENT;
- goto out;
- }
-
- neigh = __neigh_lookup_errno(tbl, dst, dev);
- if (IS_ERR(neigh)) {
- err = PTR_ERR(neigh);
- goto out;
- }
- } else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
- err = -EEXIST;
- neigh_release(neigh);
- goto out;
- }
-
- if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
- flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
- if (ndm->ndm_flags & NTF_USE) {
- neigh_event_send(neigh, NULL);
- err = 0;
- } else
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
- neigh_release(neigh);
- goto out;
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
- read_unlock(&neigh_tbl_lock);
- err = -EAFNOSUPPORT;
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ neigh_release(neigh);
+
out:
return err;
}
@@ -1990,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
struct nlattr *tb[NDTA_MAX+1];
- int err;
+ bool found = false;
+ int err, tidx;
err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
nl_neightbl_policy);
@@ -2003,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
}
ndtmsg = nlmsg_data(nlh);
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
-
- if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
+ found = true;
break;
+ }
}
- if (tbl == NULL) {
- err = -ENOENT;
- goto errout_locked;
- }
+ if (!found)
+ return -ENOENT;
/*
* We acquire tbl->lock to be nice to the periodic timers and
@@ -2126,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
errout_tbl_lock:
write_unlock_bh(&tbl->lock);
-errout_locked:
- read_unlock(&neigh_tbl_lock);
errout:
return err;
}
@@ -2142,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
+
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
@@ -2154,7 +2123,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
NLM_F_MULTI) <= 0)
break;
- for (nidx = 0, p = tbl->parms.next; p; p = p->next) {
+ nidx = 0;
+ p = list_next_entry(&tbl->parms, list);
+ list_for_each_entry_from(p, &tbl->parms_list, list) {
if (!net_eq(neigh_parms_net(p), net))
continue;
@@ -2174,7 +2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
neigh_skip = 0;
}
out:
- read_unlock(&neigh_tbl_lock);
cb->args[0] = tidx;
cb->args[1] = nidx;
@@ -2357,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
int proxy = 0;
int err;
- read_lock(&neigh_tbl_lock);
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
/* check for full ndmsg structure presence, family member is
@@ -2369,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
s_t = cb->args[0];
- for (tbl = neigh_tables, t = 0; tbl;
- tbl = tbl->next, t++) {
+ for (t = 0; t < NEIGH_NR_TABLES; t++) {
+ tbl = neigh_tables[t];
+
+ if (!tbl)
+ continue;
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
@@ -2383,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
break;
}
- read_unlock(&neigh_tbl_lock);
cb->args[0] = t;
return skb->len;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 9dd06699b09c..999341244434 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -12,6 +12,7 @@
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
+#include <net/switchdev.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
@@ -325,6 +326,23 @@ static ssize_t tx_queue_len_store(struct device *dev,
}
NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
+static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
+{
+ dev->gro_flush_timeout = val;
+ return 0;
+}
+
+static ssize_t gro_flush_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+}
+NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -387,7 +405,7 @@ static ssize_t phys_port_id_show(struct device *dev,
return restart_syscall();
if (dev_isalive(netdev)) {
- struct netdev_phys_port_id ppid;
+ struct netdev_phys_item_id ppid;
ret = dev_get_phys_port_id(netdev, &ppid);
if (!ret)
@@ -399,6 +417,28 @@ static ssize_t phys_port_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_port_id);
+static ssize_t phys_switch_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_item_id ppid;
+
+ ret = netdev_switch_parent_id_get(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_switch_id);
+
static struct attribute *net_class_attrs[] = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -422,7 +462,9 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_mtu.attr,
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
+ &dev_attr_gro_flush_timeout.attr,
&dev_attr_phys_port_id.attr,
+ &dev_attr_phys_switch_id.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7c6b51a58968..ce780c722e48 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -224,7 +224,7 @@ static void net_free(struct net *net)
return;
}
#endif
- kfree(net->gen);
+ kfree(rcu_access_pointer(net->gen));
kmem_cache_free(net_cachep, net);
}
@@ -337,17 +337,17 @@ EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
- struct proc_ns *ei;
struct file *file;
+ struct ns_common *ns;
struct net *net;
file = proc_ns_fget(fd);
if (IS_ERR(file))
return ERR_CAST(file);
- ei = get_proc_ns(file_inode(file));
- if (ei->ns_ops == &netns_operations)
- net = get_net(ei->ns);
+ ns = get_proc_ns(file_inode(file));
+ if (ns->ops == &netns_operations)
+ net = get_net(container_of(ns, struct net, ns));
else
net = ERR_PTR(-EINVAL);
@@ -386,12 +386,15 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
static __net_init int net_ns_net_init(struct net *net)
{
- return proc_alloc_inum(&net->proc_inum);
+#ifdef CONFIG_NET_NS
+ net->ns.ops = &netns_operations;
+#endif
+ return ns_alloc_inum(&net->ns);
}
static __net_exit void net_ns_net_exit(struct net *net)
{
- proc_free_inum(net->proc_inum);
+ ns_free_inum(&net->ns);
}
static struct pernet_operations __net_initdata net_ns_ops = {
@@ -629,7 +632,7 @@ void unregister_pernet_device(struct pernet_operations *ops)
EXPORT_SYMBOL_GPL(unregister_pernet_device);
#ifdef CONFIG_NET_NS
-static void *netns_get(struct task_struct *task)
+static struct ns_common *netns_get(struct task_struct *task)
{
struct net *net = NULL;
struct nsproxy *nsproxy;
@@ -640,17 +643,22 @@ static void *netns_get(struct task_struct *task)
net = get_net(nsproxy->net_ns);
task_unlock(task);
- return net;
+ return net ? &net->ns : NULL;
}
-static void netns_put(void *ns)
+static inline struct net *to_net_ns(struct ns_common *ns)
{
- put_net(ns);
+ return container_of(ns, struct net, ns);
}
-static int netns_install(struct nsproxy *nsproxy, void *ns)
+static void netns_put(struct ns_common *ns)
{
- struct net *net = ns;
+ put_net(to_net_ns(ns));
+}
+
+static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct net *net = to_net_ns(ns);
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -661,18 +669,11 @@ static int netns_install(struct nsproxy *nsproxy, void *ns)
return 0;
}
-static unsigned int netns_inum(void *ns)
-{
- struct net *net = ns;
- return net->proc_inum;
-}
-
const struct proc_ns_operations netns_operations = {
.name = "net",
.type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
- .inum = netns_inum,
};
#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 907fb5e36c02..e0ad5d16c9c5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644);
static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
- const struct net_device_ops *ops = dev->netdev_ops;
int status = NETDEV_TX_OK;
netdev_features_t features;
@@ -80,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
if (vlan_tx_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto,
- vlan_tx_tag_get(skb));
+ skb = __vlan_hwaccel_push_inside(skb);
if (unlikely(!skb)) {
/* This is actually a packet drop, but we
* don't want the code that calls this
@@ -89,12 +87,9 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
*/
goto out;
}
- skb->vlan_tci = 0;
}
- status = ops->ndo_start_xmit(skb, dev);
- if (status == NETDEV_TX_OK)
- txq_trans_update(txq);
+ status = netdev_start_xmit(skb, dev, txq, false);
out:
return status;
@@ -116,7 +111,7 @@ static void queue_process(struct work_struct *work)
continue;
}
- txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+ txq = skb_get_tx_queue(dev, skb);
local_irq_save(flags);
HARD_TX_LOCK(dev, txq, smp_processor_id());
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8b849ddfef2e..da934fc3faa8 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -202,6 +202,7 @@
#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
#define F_NODE (1<<15) /* Node memory alloc*/
#define F_UDPCSUM (1<<16) /* Include UDP checksum */
+#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */
/* Thread control flag bits */
#define T_STOP (1<<0) /* Stop run */
@@ -386,6 +387,7 @@ struct pktgen_dev {
u16 queue_map_min;
u16 queue_map_max;
__u32 skb_priority; /* skb priority field */
+ unsigned int burst; /* number of duplicated packets to burst */
int node; /* Memory node */
#ifdef CONFIG_XFRM
@@ -505,7 +507,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
pktgen_reset_all_threads(pn);
else
- pr_warning("Unknown command: %s\n", data);
+ pr_warn("Unknown command: %s\n", data);
return count;
}
@@ -612,6 +614,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->traffic_class)
seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
+ if (pkt_dev->burst > 1)
+ seq_printf(seq, " burst: %d\n", pkt_dev->burst);
+
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
@@ -638,6 +643,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->flags & F_UDPCSUM)
seq_puts(seq, "UDPCSUM ");
+ if (pkt_dev->flags & F_NO_TIMESTAMP)
+ seq_puts(seq, "NO_TIMESTAMP ");
+
if (pkt_dev->flags & F_MPLS_RND)
seq_puts(seq, "MPLS_RND ");
@@ -857,14 +865,14 @@ static ssize_t pktgen_if_write(struct file *file,
pg_result = &(pkt_dev->result[0]);
if (count < 1) {
- pr_warning("wrong command format\n");
+ pr_warn("wrong command format\n");
return -EINVAL;
}
max = count;
tmp = count_trail_chars(user_buffer, max);
if (tmp < 0) {
- pr_warning("illegal format\n");
+ pr_warn("illegal format\n");
return tmp;
}
i = tmp;
@@ -1120,6 +1128,16 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->dst_mac_count);
return count;
}
+ if (!strcmp(name, "burst")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->burst = value < 1 ? 1 : value;
+ sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
+ return count;
+ }
if (!strcmp(name, "node")) {
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
@@ -1243,6 +1261,9 @@ static ssize_t pktgen_if_write(struct file *file,
else if (strcmp(f, "!UDPCSUM") == 0)
pkt_dev->flags &= ~F_UDPCSUM;
+ else if (strcmp(f, "NO_TIMESTAMP") == 0)
+ pkt_dev->flags |= F_NO_TIMESTAMP;
+
else {
sprintf(pg_result,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
@@ -1251,6 +1272,7 @@ static ssize_t pktgen_if_write(struct file *file,
"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
"MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
"QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+ "NO_TIMESTAMP, "
#ifdef CONFIG_XFRM
"IPSEC, "
#endif
@@ -2048,15 +2070,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
ntxq = pkt_dev->odev->real_num_tx_queues;
if (ntxq <= pkt_dev->queue_map_min) {
- pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
- pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
- pkt_dev->odevname);
+ pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
}
if (pkt_dev->queue_map_max >= ntxq) {
- pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
- pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
- pkt_dev->odevname);
+ pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
}
@@ -2685,9 +2707,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
pgh->pgh_magic = htonl(PKTGEN_MAGIC);
pgh->seq_num = htonl(pkt_dev->seq_num);
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
+ if (pkt_dev->flags & F_NO_TIMESTAMP) {
+ pgh->tv_sec = 0;
+ pgh->tv_usec = 0;
+ } else {
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+ }
}
static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
@@ -3160,8 +3187,8 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
if (!pkt_dev->running) {
- pr_warning("interface: %s is already stopped\n",
- pkt_dev->odevname);
+ pr_warn("interface: %s is already stopped\n",
+ pkt_dev->odevname);
return -EINVAL;
}
@@ -3284,11 +3311,9 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
+ unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
struct net_device *odev = pkt_dev->odev;
- netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *)
- = odev->netdev_ops->ndo_start_xmit;
struct netdev_queue *txq;
- u16 queue_map;
int ret;
/* If device is offline, then don't send */
@@ -3326,8 +3351,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->delay && pkt_dev->last_ok)
spin(pkt_dev, pkt_dev->next_tx);
- queue_map = skb_get_queue_mapping(pkt_dev->skb);
- txq = netdev_get_tx_queue(odev, queue_map);
+ txq = skb_get_tx_queue(odev, pkt_dev->skb);
local_bh_disable();
@@ -3338,16 +3362,19 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
goto unlock;
}
- atomic_inc(&(pkt_dev->skb->users));
- ret = (*xmit)(pkt_dev->skb, odev);
+ atomic_add(burst, &pkt_dev->skb->users);
+
+xmit_more:
+ ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
switch (ret) {
case NETDEV_TX_OK:
- txq_trans_update(txq);
pkt_dev->last_ok = 1;
pkt_dev->sofar++;
pkt_dev->seq_num++;
pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq))
+ goto xmit_more;
break;
case NET_XMIT_DROP:
case NET_XMIT_CN:
@@ -3366,6 +3393,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
atomic_dec(&(pkt_dev->skb->users));
pkt_dev->last_ok = 0;
}
+ if (unlikely(burst))
+ atomic_sub(burst, &pkt_dev->skb->users);
unlock:
HARD_TX_UNLOCK(odev, txq);
@@ -3564,6 +3593,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_p = 0;
pkt_dev->svlan_cfi = 0;
pkt_dev->svlan_id = 0xffff;
+ pkt_dev->burst = 1;
pkt_dev->node = -1;
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
@@ -3684,7 +3714,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
if (pkt_dev->running) {
- pr_warning("WARNING: trying to remove a running interface, stopping it now\n");
+ pr_warn("WARNING: trying to remove a running interface, stopping it now\n");
pktgen_stop_device(pkt_dev);
}
@@ -3698,8 +3728,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Remove proc before if_list entry, because add_device uses
* list to determine if interface already exist, avoid race
* with proc_create_data() */
- if (pkt_dev->entry)
- proc_remove(pkt_dev->entry);
+ proc_remove(pkt_dev->entry);
/* And update the thread if_list */
_rem_dev_from_if_list(t, pkt_dev);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f0493e3b7471..9cf6fe9ddc0c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -36,6 +36,7 @@
#include <linux/mutex.h>
#include <linux/if_addr.h>
#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
@@ -43,6 +44,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
+#include <net/switchdev.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
@@ -365,11 +367,10 @@ static void rtnl_lock_unregistering_all(void)
{
struct net *net;
bool unregistering;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
- prepare_to_wait(&netdev_unregistering_wq, &wait,
- TASK_UNINTERRUPTIBLE);
unregistering = false;
rtnl_lock();
for_each_net(net) {
@@ -381,9 +382,10 @@ static void rtnl_lock_unregistering_all(void)
if (!unregistering)
break;
__rtnl_unlock();
- schedule();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
- finish_wait(&netdev_unregistering_wq, &wait);
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
}
/**
@@ -868,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
- + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -952,7 +955,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
{
int err;
- struct netdev_phys_port_id ppid;
+ struct netdev_phys_item_id ppid;
err = dev_get_phys_port_id(dev, &ppid);
if (err) {
@@ -967,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_item_id psid;
+
+ err = netdev_switch_parent_id_get(dev, &psid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask)
@@ -1039,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_phys_port_id_fill(skb, dev))
goto nla_put_failure;
+ if (rtnl_phys_switch_id_fill(skb, dev))
+ goto nla_put_failure;
+
attr = nla_reserve(skb, IFLA_STATS,
sizeof(struct rtnl_link_stats));
if (attr == NULL)
@@ -1196,8 +1220,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PROMISCUITY] = { .type = NLA_U32 },
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
- [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
+ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1481,9 +1506,12 @@ static int do_set_master(struct net_device *dev, int ifindex)
return 0;
}
+#define DO_SETLINK_MODIFIED 0x01
+/* notify flag means notify + modified. */
+#define DO_SETLINK_NOTIFY 0x03
static int do_setlink(const struct sk_buff *skb,
struct net_device *dev, struct ifinfomsg *ifm,
- struct nlattr **tb, char *ifname, int modified)
+ struct nlattr **tb, char *ifname, int status)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
@@ -1495,6 +1523,7 @@ static int do_setlink(const struct sk_buff *skb,
goto errout;
}
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
err = -EPERM;
goto errout;
}
@@ -1502,7 +1531,7 @@ static int do_setlink(const struct sk_buff *skb,
put_net(net);
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_MAP]) {
@@ -1531,7 +1560,7 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_ADDRESS]) {
@@ -1551,19 +1580,19 @@ static int do_setlink(const struct sk_buff *skb,
kfree(sa);
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_MTU]) {
err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_GROUP]) {
dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
/*
@@ -1575,7 +1604,7 @@ static int do_setlink(const struct sk_buff *skb,
err = dev_change_name(dev, ifname);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_IFALIAS]) {
@@ -1583,7 +1612,7 @@ static int do_setlink(const struct sk_buff *skb,
nla_len(tb[IFLA_IFALIAS]));
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_BROADCAST]) {
@@ -1601,25 +1630,35 @@ static int do_setlink(const struct sk_buff *skb,
err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_CARRIER]) {
err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
}
- if (tb[IFLA_TXQLEN])
- dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_TXQLEN]) {
+ unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
+
+ if (dev->tx_queue_len ^ value)
+ status |= DO_SETLINK_NOTIFY;
+
+ dev->tx_queue_len = value;
+ }
if (tb[IFLA_OPERSTATE])
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINKMODE]) {
+ unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
+
write_lock_bh(&dev_base_lock);
- dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (dev->link_mode ^ value)
+ status |= DO_SETLINK_NOTIFY;
+ dev->link_mode = value;
write_unlock_bh(&dev_base_lock);
}
@@ -1634,7 +1673,7 @@ static int do_setlink(const struct sk_buff *skb,
err = do_setvfinfo(dev, attr);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
@@ -1664,7 +1703,7 @@ static int do_setlink(const struct sk_buff *skb,
err = ops->ndo_set_vf_port(dev, vf, port);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
@@ -1682,7 +1721,7 @@ static int do_setlink(const struct sk_buff *skb,
err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_AF_SPEC]) {
@@ -1699,15 +1738,20 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
errout:
- if (err < 0 && modified)
- net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
- dev->name);
+ if (status & DO_SETLINK_MODIFIED) {
+ if (status & DO_SETLINK_NOTIFY)
+ netdev_state_change(dev);
+
+ if (err < 0)
+ net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+ dev->name);
+ }
return err;
}
@@ -1989,7 +2033,7 @@ replay:
}
if (dev) {
- int modified = 0;
+ int status = 0;
if (nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
@@ -2004,7 +2048,7 @@ replay:
err = ops->changelink(dev, tb, data);
if (err < 0)
return err;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
@@ -2015,10 +2059,10 @@ replay:
tb, slave_data);
if (err < 0)
return err;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
- return do_setlink(skb, dev, ifm, tb, ifname, modified);
+ return do_setlink(skb, dev, ifm, tb, ifname, status);
}
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
@@ -2202,8 +2246,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
- gfp_t flags)
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+ unsigned int change, gfp_t flags)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -2221,11 +2265,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
- return;
+ return skb;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return NULL;
+}
+
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
+{
+ struct sk_buff *skb;
+
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, flags);
}
EXPORT_SYMBOL(rtmsg_ifinfo);
@@ -2294,7 +2355,7 @@ errout:
int ndo_dflt_fdb_add(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
- const unsigned char *addr,
+ const unsigned char *addr, u16 vid,
u16 flags)
{
int err = -EINVAL;
@@ -2307,6 +2368,11 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
return err;
}
+ if (vid) {
+ pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
+ return err;
+ }
+
if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
err = dev_uc_add_excl(dev, addr);
else if (is_multicast_ether_addr(addr))
@@ -2320,6 +2386,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
}
EXPORT_SYMBOL(ndo_dflt_fdb_add);
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+{
+ u16 vid = 0;
+
+ if (vlan_attr) {
+ if (nla_len(vlan_attr) != sizeof(u16)) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+ return -EINVAL;
+ }
+
+ vid = nla_get_u16(vlan_attr);
+
+ if (!vid || vid >= VLAN_VID_MASK) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
+ vid);
+ return -EINVAL;
+ }
+ }
+ *p_vid = vid;
+ return 0;
+}
+
static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
@@ -2327,6 +2415,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[NDA_MAX+1];
struct net_device *dev;
u8 *addr;
+ u16 vid;
int err;
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2352,6 +2441,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
addr = nla_data(tb[NDA_LLADDR]);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ if (err)
+ return err;
+
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
@@ -2360,7 +2453,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
- err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
if (err)
goto out;
else
@@ -2371,9 +2465,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
if ((ndm->ndm_flags & NTF_SELF)) {
if (dev->netdev_ops->ndo_fdb_add)
err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ vid,
nlh->nlmsg_flags);
else
- err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
nlh->nlmsg_flags);
if (!err) {
@@ -2391,7 +2486,7 @@ out:
int ndo_dflt_fdb_del(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
- const unsigned char *addr)
+ const unsigned char *addr, u16 vid)
{
int err = -EINVAL;
@@ -2420,6 +2515,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net_device *dev;
int err = -EINVAL;
__u8 *addr;
+ u16 vid;
if (!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
@@ -2447,6 +2543,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
addr = nla_data(tb[NDA_LLADDR]);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ if (err)
+ return err;
+
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
@@ -2456,7 +2556,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
const struct net_device_ops *ops = br_dev->netdev_ops;
if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr);
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
if (err)
goto out;
@@ -2467,9 +2567,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
/* Embedded bridge, macvlan, and any other device support */
if (ndm->ndm_flags & NTF_SELF) {
if (dev->netdev_ops->ndo_fdb_del)
- err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
+ vid);
else
- err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
if (!err) {
rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
@@ -2609,12 +2710,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
+static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
+ unsigned int attrnum, unsigned int flag)
+{
+ if (mask & flag)
+ return nla_put_u8(skb, attrnum, !!(flags & flag));
+ return 0;
+}
+
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
- struct net_device *dev, u16 mode)
+ struct net_device *dev, u16 mode,
+ u32 flags, u32 mask)
{
struct nlmsghdr *nlh;
struct ifinfomsg *ifm;
struct nlattr *br_afspec;
+ struct nlattr *protinfo;
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
@@ -2646,13 +2757,46 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
if (!br_afspec)
goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) ||
- nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
nla_nest_cancel(skb, br_afspec);
goto nla_put_failure;
}
+
+ if (mode != BRIDGE_MODE_UNDEF) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
nla_nest_end(skb, br_afspec);
+ protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+ if (!protinfo)
+ goto nla_put_failure;
+
+ if (brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING, BR_LEARNING) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ nla_nest_cancel(skb, protinfo);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, protinfo);
+
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
@@ -2667,13 +2811,20 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
int idx = 0;
u32 portid = NETLINK_CB(cb->skb).portid;
u32 seq = cb->nlh->nlmsg_seq;
- struct nlattr *extfilt;
u32 filter_mask = 0;
- extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
- IFLA_EXT_MASK);
- if (extfilt)
- filter_mask = nla_get_u32(extfilt);
+ if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
+ struct nlattr *extfilt;
+
+ extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
+ IFLA_EXT_MASK);
+ if (extfilt) {
+ if (nla_len(extfilt) < sizeof(filter_mask))
+ return -EINVAL;
+
+ filter_mask = nla_get_u32(extfilt);
+ }
+ }
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
@@ -2780,6 +2931,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (br_spec) {
nla_for_each_nested(attr, br_spec, rem) {
if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
have_flags = true;
flags = nla_get_u16(attr);
break;
@@ -2850,6 +3004,9 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (br_spec) {
nla_for_each_nested(attr, br_spec, rem) {
if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
have_flags = true;
flags = nla_get_u16(attr);
break;
diff --git a/net/core/scm.c b/net/core/scm.c
index b442e7e25e60..3b6899b7d810 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
struct cmsghdr *cmsg;
int err;
- for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
- {
+ for_each_cmsghdr(cmsg, msg) {
err = -EINVAL;
/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index ba71212f0251..51dd3193a33e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq)
* overlaps less than one time per MSL (2 minutes).
* Choosing a clock of 64 ns period is OK. (period of 274 s)
*/
- return seq + (ktime_to_ns(ktime_get_real()) >> 6);
+ return seq + (ktime_get_real_ns() >> 6);
}
#endif
@@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
md5_transform(hash, net_secret);
seq = hash[0] | (((u64)hash[1]) << 32);
- seq += ktime_to_ns(ktime_get_real());
+ seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
return seq;
@@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
md5_transform(hash, secret);
seq = hash[0] | (((u64)hash[1]) << 32);
- seq += ktime_to_ns(ktime_get_real());
+ seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
return seq;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8d289697cc7a..395c15b82087 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,16 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
kmemcheck_annotate_variable(shinfo->destructor_arg);
if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff *child = skb + 1;
- atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ struct sk_buff_fclones *fclones;
- kmemcheck_annotate_bitfield(child, flags1);
- kmemcheck_annotate_bitfield(child, flags2);
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
skb->fclone = SKB_FCLONE_ORIG;
- atomic_set(fclone_ref, 1);
+ atomic_set(&fclones->fclone_ref, 1);
- child->fclone = SKB_FCLONE_UNAVAILABLE;
- child->pfmemalloc = pfmemalloc;
+ fclones->skb2.fclone = SKB_FCLONE_CLONE;
+ fclones->skb2.pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -336,48 +336,85 @@ struct netdev_alloc_cache {
unsigned int pagecnt_bias;
};
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
+ gfp_t gfp_mask)
{
- struct netdev_alloc_cache *nc;
- void *data = NULL;
- int order;
- unsigned long flags;
+ const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+ if (order) {
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+ nc->frag.size = PAGE_SIZE << (page ? order : 0);
+ }
- local_irq_save(flags);
- nc = &__get_cpu_var(netdev_alloc_cache);
- if (unlikely(!nc->frag.page)) {
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+ nc->frag.page = page;
+
+ return page;
+}
+
+static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
+ unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
+ struct page *page = nc->frag.page;
+ unsigned int size;
+ int offset;
+
+ if (unlikely(!page)) {
refill:
- for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
- gfp_t gfp = gfp_mask;
+ page = __page_frag_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
- if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN;
- nc->frag.page = alloc_pages(gfp, order);
- if (likely(nc->frag.page))
- break;
- if (--order < 0)
- goto end;
- }
- nc->frag.size = PAGE_SIZE << order;
-recycle:
- atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
- nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
- nc->frag.offset = 0;
+ /* if size can vary use frag.size else just use PAGE_SIZE */
+ size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+ /* Even if we own the page, we do not use atomic_set().
+ * This would break get_page_unless_zero() users.
+ */
+ atomic_add(size - 1, &page->_count);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ nc->frag.offset = size;
}
- if (nc->frag.offset + fragsz > nc->frag.size) {
- /* avoid unnecessary locked operations if possible */
- if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
- atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
- goto recycle;
- goto refill;
+ offset = nc->frag.offset - fragsz;
+ if (unlikely(offset < 0)) {
+ if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ goto refill;
+
+ /* if size can vary use frag.size else just use PAGE_SIZE */
+ size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+ /* OK, page count is 0, we can safely set it */
+ atomic_set(&page->_count, size);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ offset = size - fragsz;
}
- data = page_address(nc->frag.page) + nc->frag.offset;
- nc->frag.offset += fragsz;
nc->pagecnt_bias--;
-end:
+ nc->frag.offset = offset;
+
+ return page_address(page) + offset;
+}
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ unsigned long flags;
+ void *data;
+
+ local_irq_save(flags);
+ data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
}
@@ -395,11 +432,25 @@ void *netdev_alloc_frag(unsigned int fragsz)
}
EXPORT_SYMBOL(netdev_alloc_frag);
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
/**
- * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
- * @dev: network device to receive on
+ * __alloc_rx_skb - allocate an skbuff for rx
* @length: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
+ * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case we have to fallback to __alloc_skb()
+ * If SKB_ALLOC_NAPI is set, page fragment will be allocated
+ * from napi_cache instead of netdev_cache.
*
* Allocate a new &sk_buff and assign it a usage count of one. The
* buffer has unspecified headroom built in. Users should allocate
@@ -408,11 +459,11 @@ EXPORT_SYMBOL(netdev_alloc_frag);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
+static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
+ int flags)
{
struct sk_buff *skb = NULL;
- unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+ unsigned int fragsz = SKB_DATA_ALIGN(length) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
@@ -421,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __netdev_alloc_frag(fragsz, gfp_mask);
+ data = (flags & SKB_ALLOC_NAPI) ?
+ __napi_alloc_frag(fragsz, gfp_mask) :
+ __netdev_alloc_frag(fragsz, gfp_mask);
if (likely(data)) {
skb = build_skb(data, fragsz);
@@ -429,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
put_page(virt_to_head_page(data));
}
} else {
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ skb = __alloc_skb(length, gfp_mask,
SKB_ALLOC_RX, NUMA_NO_NODE);
}
+ return skb;
+}
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+
+ length += NET_SKB_PAD;
+ skb = __alloc_rx_skb(length, gfp_mask, 0);
+
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
}
+
return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
+/**
+ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * @napi: napi instance this buffer was allocated for
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
+ *
+ * Allocate a new sk_buff for use in NAPI receive. This buffer will
+ * attempt to allocate the head from a special reserved region used
+ * only for NAPI Rx allocation. By doing this we can save several
+ * CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+
+ length += NET_SKB_PAD + NET_IP_ALIGN;
+ skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(__napi_alloc_skb);
+
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
@@ -491,32 +599,33 @@ static void skb_free_head(struct sk_buff *skb)
static void skb_release_data(struct sk_buff *skb)
{
- if (!skb->cloned ||
- !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
- &skb_shinfo(skb)->dataref)) {
- if (skb_shinfo(skb)->nr_frags) {
- int i;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- skb_frag_unref(skb, i);
- }
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int i;
- /*
- * If skb buf is from userspace, we need to notify the caller
- * the lower device DMA has done;
- */
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
+ if (skb->cloned &&
+ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &shinfo->dataref))
+ return;
- uarg = skb_shinfo(skb)->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, true);
- }
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i]);
- if (skb_has_frag_list(skb))
- skb_drop_fraglist(skb);
+ /*
+ * If skb buf is from userspace, we need to notify the caller
+ * the lower device DMA has done;
+ */
+ if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
- skb_free_head(skb);
+ uarg = shinfo->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, true);
}
+
+ if (shinfo->frag_list)
+ kfree_skb_list(shinfo->frag_list);
+
+ skb_free_head(skb);
}
/*
@@ -524,33 +633,32 @@ static void skb_release_data(struct sk_buff *skb)
*/
static void kfree_skbmem(struct sk_buff *skb)
{
- struct sk_buff *other;
- atomic_t *fclone_ref;
+ struct sk_buff_fclones *fclones;
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(skbuff_head_cache, skb);
- break;
+ return;
case SKB_FCLONE_ORIG:
- fclone_ref = (atomic_t *) (skb + 2);
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, skb);
- break;
-
- case SKB_FCLONE_CLONE:
- fclone_ref = (atomic_t *) (skb + 1);
- other = skb - 1;
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
- /* The clone portion is available for
- * fast-cloning again.
+ /* We usually free the clone (TX completion) before original skb
+ * This test would have no chance to be true for the clone,
+ * while here, branch prediction will be good.
*/
- skb->fclone = SKB_FCLONE_UNAVAILABLE;
+ if (atomic_read(&fclones->fclone_ref) == 1)
+ goto fastpath;
+ break;
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, other);
+ default: /* SKB_FCLONE_CLONE */
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
}
+ if (!atomic_dec_and_test(&fclones->fclone_ref))
+ return;
+fastpath:
+ kmem_cache_free(skbuff_fclone_cache, fclones);
}
static void skb_release_head_state(struct sk_buff *skb)
@@ -566,7 +674,7 @@ static void skb_release_head_state(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
@@ -674,57 +782,61 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
+/* Make sure a field is enclosed inside headers_start/headers_end section */
+#define CHECK_SKB_FIELD(field) \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
+ offsetof(struct sk_buff, headers_start)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
+ offsetof(struct sk_buff, headers_end)); \
+
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
new->tstamp = old->tstamp;
+ /* We do not copy old->sk */
new->dev = old->dev;
- new->transport_header = old->transport_header;
- new->network_header = old->network_header;
- new->mac_header = old->mac_header;
- new->inner_protocol = old->inner_protocol;
- new->inner_transport_header = old->inner_transport_header;
- new->inner_network_header = old->inner_network_header;
- new->inner_mac_header = old->inner_mac_header;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
- skb_copy_hash(new, old);
- new->ooo_okay = old->ooo_okay;
- new->no_fcs = old->no_fcs;
- new->encapsulation = old->encapsulation;
- new->encap_hdr_csum = old->encap_hdr_csum;
- new->csum_valid = old->csum_valid;
- new->csum_complete_sw = old->csum_complete_sw;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
- memcpy(new->cb, old->cb, sizeof(old->cb));
- new->csum = old->csum;
- new->ignore_df = old->ignore_df;
- new->pkt_type = old->pkt_type;
- new->ip_summed = old->ip_summed;
- skb_copy_queue_mapping(new, old);
- new->priority = old->priority;
-#if IS_ENABLED(CONFIG_IP_VS)
- new->ipvs_property = old->ipvs_property;
+ __nf_copy(new, old, false);
+
+ /* Note : this field could be in headers_start/headers_end section
+ * It is not yet because we do not want to have a 16 bit hole
+ */
+ new->queue_mapping = old->queue_mapping;
+
+ memcpy(&new->headers_start, &old->headers_start,
+ offsetof(struct sk_buff, headers_end) -
+ offsetof(struct sk_buff, headers_start));
+ CHECK_SKB_FIELD(protocol);
+ CHECK_SKB_FIELD(csum);
+ CHECK_SKB_FIELD(hash);
+ CHECK_SKB_FIELD(priority);
+ CHECK_SKB_FIELD(skb_iif);
+ CHECK_SKB_FIELD(vlan_proto);
+ CHECK_SKB_FIELD(vlan_tci);
+ CHECK_SKB_FIELD(transport_header);
+ CHECK_SKB_FIELD(network_header);
+ CHECK_SKB_FIELD(mac_header);
+ CHECK_SKB_FIELD(inner_protocol);
+ CHECK_SKB_FIELD(inner_transport_header);
+ CHECK_SKB_FIELD(inner_network_header);
+ CHECK_SKB_FIELD(inner_mac_header);
+ CHECK_SKB_FIELD(mark);
+#ifdef CONFIG_NETWORK_SECMARK
+ CHECK_SKB_FIELD(secmark);
+#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ CHECK_SKB_FIELD(napi_id);
#endif
- new->pfmemalloc = old->pfmemalloc;
- new->protocol = old->protocol;
- new->mark = old->mark;
- new->skb_iif = old->skb_iif;
- __nf_copy(new, old);
#ifdef CONFIG_NET_SCHED
- new->tc_index = old->tc_index;
+ CHECK_SKB_FIELD(tc_index);
#ifdef CONFIG_NET_CLS_ACT
- new->tc_verd = old->tc_verd;
+ CHECK_SKB_FIELD(tc_verd);
#endif
#endif
- new->vlan_proto = old->vlan_proto;
- new->vlan_tci = old->vlan_tci;
-
- skb_copy_secmark(new, old);
-#ifdef CONFIG_NET_RX_BUSY_POLL
- new->napi_id = old->napi_id;
-#endif
}
/*
@@ -855,17 +967,18 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
+ struct sk_buff_fclones *fclones = container_of(skb,
+ struct sk_buff_fclones,
+ skb1);
struct sk_buff *n;
if (skb_orphan_frags(skb, gfp_mask))
return NULL;
- n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
- n->fclone == SKB_FCLONE_UNAVAILABLE) {
- atomic_t *fclone_ref = (atomic_t *) (n + 1);
- n->fclone = SKB_FCLONE_CLONE;
- atomic_inc(fclone_ref);
+ atomic_read(&fclones->fclone_ref) == 1) {
+ n = &fclones->skb2;
+ atomic_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -875,7 +988,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
return NULL;
kmemcheck_annotate_bitfield(n, flags1);
- kmemcheck_annotate_bitfield(n, flags2);
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
@@ -2988,7 +3100,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (nskb->len == len + doffset)
goto perform_csum_check;
- if (!sg) {
+ if (!sg && !nskb->remcsum_offload) {
nskb->ip_summed = CHECKSUM_NONE;
nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb, len),
@@ -3060,7 +3172,7 @@ skip_fraglist:
nskb->truesize += nskb->data_len;
perform_csum_check:
- if (!csum) {
+ if (!csum && !nskb->remcsum_offload) {
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
nskb->ip_summed = CHECKSUM_NONE;
@@ -3069,6 +3181,21 @@ perform_csum_check:
}
} while ((offset += len) < head_skb->len);
+ /* Some callers want to get the end of the list.
+ * Put it in segs->prev to avoid walking the list.
+ * (see validate_xmit_skb_list() for example)
+ */
+ segs->prev = tail;
+
+ /* Following permits correct backpressure, for protocols
+ * using skb_set_owner_w().
+ * Idea is to tranfert ownership from head_skb to last segment.
+ */
+ if (head_skb->destructor == sock_wfree) {
+ swap(tail->truesize, head_skb->truesize);
+ swap(tail->destructor, head_skb->destructor);
+ swap(tail->sk, head_skb->sk);
+ }
return segs;
err:
@@ -3182,7 +3309,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
skb_shinfo(nskb)->frag_list = p;
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
pinfo->gso_size = 0;
- skb_header_release(p);
+ __skb_header_release(p);
NAPI_GRO_CB(nskb)->last = p;
nskb->data_len += p->len;
@@ -3214,7 +3341,7 @@ merge:
else
NAPI_GRO_CB(p)->last->next = skb;
NAPI_GRO_CB(p)->last = skb;
- skb_header_release(skb);
+ __skb_header_release(skb);
lp = p;
done:
@@ -3230,7 +3357,6 @@ done:
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
-EXPORT_SYMBOL_GPL(skb_gro_receive);
void __init skb_init(void)
{
@@ -3240,8 +3366,7 @@ void __init skb_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
- (2*sizeof(struct sk_buff)) +
- sizeof(atomic_t),
+ sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
@@ -3494,32 +3619,66 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
-void __skb_tstamp_tx(struct sk_buff *orig_skb,
- struct skb_shared_hwtstamps *hwtstamps,
- struct sock *sk, int tstype)
+struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
- struct sock_exterr_skb *serr;
- struct sk_buff *skb;
- int err;
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *skb_next;
+ int err = 0;
- if (!sk)
- return;
+ spin_lock_bh(&q->lock);
+ skb = __skb_dequeue(q);
+ if (skb && (skb_next = skb_peek(q)))
+ err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ spin_unlock_bh(&q->lock);
- if (hwtstamps) {
- *skb_hwtstamps(orig_skb) =
- *hwtstamps;
- } else {
- /*
- * no hardware time stamps available,
- * so keep the shared tx_flags and only
- * store software time stamp
- */
- orig_skb->tstamp = ktime_get_real();
+ sk->sk_err = err;
+ if (err)
+ sk->sk_error_report(sk);
+
+ return skb;
+}
+EXPORT_SYMBOL(sock_dequeue_err_skb);
+
+/**
+ * skb_clone_sk - create clone of skb, and take reference to socket
+ * @skb: the skb to clone
+ *
+ * This function creates a clone of a buffer that holds a reference on
+ * sk_refcnt. Buffers created via this function are meant to be
+ * returned using sock_queue_err_skb, or free via kfree_skb.
+ *
+ * When passing buffers allocated with this function to sock_queue_err_skb
+ * it is necessary to wrap the call with sock_hold/sock_put in order to
+ * prevent the socket from being released prior to being enqueued on
+ * the sk_error_queue.
+ */
+struct sk_buff *skb_clone_sk(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sk_buff *clone;
+
+ if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return NULL;
}
- skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!skb)
- return;
+ clone->sk = sk;
+ clone->destructor = sock_efree;
+
+ return clone;
+}
+EXPORT_SYMBOL(skb_clone_sk);
+
+static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct sock *sk,
+ int tstype)
+{
+ struct sock_exterr_skb *serr;
+ int err;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
@@ -3537,6 +3696,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (err)
kfree_skb(skb);
}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ struct sk_buff *skb;
+
+ if (!sk)
+ return;
+
+ if (hwtstamps)
+ *skb_hwtstamps(orig_skb) = *hwtstamps;
+ else
+ orig_skb->tstamp = ktime_get_real();
+
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ __skb_complete_tx_timestamp(skb, sk, tstype);
+}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
@@ -3561,9 +3756,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
err = sock_queue_err_skb(sk, skb);
if (err)
kfree_skb(skb);
+
+ sock_put(sk);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
@@ -3864,7 +4064,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
return false;
if (len <= skb_tailroom(to)) {
- BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ if (len)
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
return true;
}
@@ -3947,6 +4148,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->ignore_df = 0;
skb_dst_drop(skb);
skb->mark = 0;
+ skb_init_secmark(skb);
secpath_reset(skb);
nf_reset(skb);
nf_reset_trace(skb);
@@ -3966,15 +4168,22 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int thlen = 0;
- if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- return tcp_hdrlen(skb) + shinfo->gso_size;
+ if (skb->encapsulation) {
+ thlen = skb_inner_transport_header(skb) -
+ skb_transport_header(skb);
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ thlen += inner_tcp_hdrlen(skb);
+ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ thlen = tcp_hdrlen(skb);
+ }
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
* accounted for.
*/
- return shinfo->gso_size;
+ return thlen + shinfo->gso_size;
}
EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
@@ -4029,3 +4238,188 @@ err_free:
return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);
+
+int skb_ensure_writable(struct sk_buff *skb, int write_len)
+{
+ if (!pskb_may_pull(skb, write_len))
+ return -ENOMEM;
+
+ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+ return 0;
+
+ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
+/* remove VLAN header from packet and update csum accordingly. */
+static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+ struct vlan_hdr *vhdr;
+ unsigned int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ __skb_push(skb, offset);
+ err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+ if (unlikely(err))
+ goto pull;
+
+ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+ *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+ __skb_pull(skb, VLAN_HLEN);
+
+ vlan_set_encap_proto(skb, vhdr);
+ skb->mac_header += VLAN_HLEN;
+
+ if (skb_network_offset(skb) < ETH_HLEN)
+ skb_set_network_header(skb, ETH_HLEN);
+
+ skb_reset_mac_len(skb);
+pull:
+ __skb_pull(skb, offset);
+
+ return err;
+}
+
+int skb_vlan_pop(struct sk_buff *skb)
+{
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ int err;
+
+ if (likely(vlan_tx_tag_present(skb))) {
+ skb->vlan_tci = 0;
+ } else {
+ if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
+ skb->protocol != htons(ETH_P_8021AD)) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (err)
+ return err;
+ }
+ /* move next vlan tag to hw accel tag */
+ if (likely((skb->protocol != htons(ETH_P_8021Q) &&
+ skb->protocol != htons(ETH_P_8021AD)) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ vlan_proto = skb->protocol;
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (unlikely(err))
+ return err;
+
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+ if (vlan_tx_tag_present(skb)) {
+ unsigned int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ /* __vlan_insert_tag expect skb->data pointing to mac header.
+ * So change skb->data before calling it and change back to
+ * original position later
+ */
+ __skb_push(skb, offset);
+ err = __vlan_insert_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (err)
+ return err;
+ skb->protocol = skb->vlan_proto;
+ skb->mac_len += VLAN_HLEN;
+ __skb_pull(skb, offset);
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(skb->data
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
+ }
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
+
+/**
+ * alloc_skb_with_frags - allocate skb with page frags
+ *
+ * @header_len: size of linear part
+ * @data_len: needed length in frags
+ * @max_page_order: max page order desired.
+ * @errcode: pointer to error code if any
+ * @gfp_mask: allocation mask
+ *
+ * This can be used to allocate a paged skb, given a maximal order for frags.
+ */
+struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
+ unsigned long data_len,
+ int max_page_order,
+ int *errcode,
+ gfp_t gfp_mask)
+{
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ unsigned long chunk;
+ struct sk_buff *skb;
+ struct page *page;
+ gfp_t gfp_head;
+ int i;
+
+ *errcode = -EMSGSIZE;
+ /* Note this test could be relaxed, if we succeed to allocate
+ * high order pages...
+ */
+ if (npages > MAX_SKB_FRAGS)
+ return NULL;
+
+ gfp_head = gfp_mask;
+ if (gfp_head & __GFP_WAIT)
+ gfp_head |= __GFP_REPEAT;
+
+ *errcode = -ENOBUFS;
+ skb = alloc_skb(header_len, gfp_head);
+ if (!skb)
+ return NULL;
+
+ skb->truesize += npages << PAGE_SHIFT;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages(gfp_mask |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
+ }
+ order--;
+ }
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
+ }
+ return skb;
+
+failure:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_with_frags);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9c3f823e76a9..1c7a33db1314 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
- int skb_len;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
@@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
skb->dev = NULL;
skb_set_owner_r(skb, sk);
- /* Cache the SKB length before we tack it onto the receive
- * queue. Once it is added it no longer belongs to us and
- * may be freed by other threads of control pulling packets
- * from the queue.
- */
- skb_len = skb->len;
-
/* we escape from rcu protected region, make sure we dont leak
* a norefcounted dst
*/
@@ -896,6 +888,19 @@ set_rcvbuf:
}
break;
+ case SO_ATTACH_BPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_attach_bpf(ufd, sk);
+ }
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -1221,6 +1226,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_max_pacing_rate;
break;
+ case SO_INCOMING_CPU:
+ v.val = sk->sk_incoming_cpu;
+ break;
+
default:
return -ENOPROTOOPT;
}
@@ -1489,9 +1498,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
atomic_set(&newsk->sk_omem_alloc, 0);
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
-#ifdef CONFIG_NET_DMA
- skb_queue_head_init(&newsk->sk_async_wait_queue);
-#endif
spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
@@ -1528,6 +1534,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_err = 0;
newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -1645,18 +1652,24 @@ void sock_rfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_rfree);
+void sock_efree(struct sk_buff *skb)
+{
+ sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_efree);
+
+#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
-#ifdef CONFIG_INET
if (sk->sk_state == TCP_TIME_WAIT)
inet_twsk_put(inet_twsk(sk));
else
-#endif
sock_put(sk);
}
EXPORT_SYMBOL(sock_edemux);
+#endif
kuid_t sock_i_uid(struct sock *sk)
{
@@ -1718,16 +1731,34 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
}
EXPORT_SYMBOL(sock_kmalloc);
-/*
- * Free an option memory block.
+/* Free an option memory block. Note, we actually want the inline
+ * here as this allows gcc to detect the nullify and fold away the
+ * condition entirely.
*/
-void sock_kfree_s(struct sock *sk, void *mem, int size)
+static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
+ const bool nullify)
{
- kfree(mem);
+ if (WARN_ON_ONCE(!mem))
+ return;
+ if (nullify)
+ kzfree(mem);
+ else
+ kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
}
+
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, false);
+}
EXPORT_SYMBOL(sock_kfree_s);
+void sock_kzfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, true);
+}
+EXPORT_SYMBOL(sock_kzfree_s);
+
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think, these locks should be removed for datagram sockets.
*/
@@ -1764,21 +1795,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
int *errcode, int max_page_order)
{
- struct sk_buff *skb = NULL;
- unsigned long chunk;
- gfp_t gfp_mask;
+ struct sk_buff *skb;
long timeo;
int err;
- int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- struct page *page;
- int i;
-
- err = -EMSGSIZE;
- if (npages > MAX_SKB_FRAGS)
- goto failure;
timeo = sock_sndtimeo(sk, noblock);
- while (!skb) {
+ for (;;) {
err = sock_error(sk);
if (err != 0)
goto failure;
@@ -1787,66 +1809,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = -EAGAIN;
- if (!timeo)
- goto failure;
- if (signal_pending(current))
- goto interrupted;
- timeo = sock_wait_for_wmem(sk, timeo);
- continue;
- }
-
- err = -ENOBUFS;
- gfp_mask = sk->sk_allocation;
- if (gfp_mask & __GFP_WAIT)
- gfp_mask |= __GFP_REPEAT;
+ if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ break;
- skb = alloc_skb(header_len, gfp_mask);
- if (!skb)
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
goto failure;
-
- skb->truesize += data_len;
-
- for (i = 0; npages > 0; i++) {
- int order = max_page_order;
-
- while (order) {
- if (npages >= 1 << order) {
- page = alloc_pages(sk->sk_allocation |
- __GFP_COMP |
- __GFP_NOWARN |
- __GFP_NORETRY,
- order);
- if (page)
- goto fill_page;
- /* Do not retry other high order allocations */
- order = 1;
- max_page_order = 0;
- }
- order--;
- }
- page = alloc_page(sk->sk_allocation);
- if (!page)
- goto failure;
-fill_page:
- chunk = min_t(unsigned long, data_len,
- PAGE_SIZE << order);
- skb_fill_page_desc(skb, i, page, 0, chunk);
- data_len -= chunk;
- npages -= 1 << order;
- }
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
}
-
- skb_set_owner_w(skb, sk);
+ skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
+ errcode, sk->sk_allocation);
+ if (skb)
+ skb_set_owner_w(skb, sk);
return skb;
interrupted:
err = sock_intr_errno(timeo);
failure:
- kfree_skb(skb);
*errcode = err;
return NULL;
}
@@ -2308,9 +2291,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
skb_queue_head_init(&sk->sk_receive_queue);
skb_queue_head_init(&sk->sk_write_queue);
skb_queue_head_init(&sk->sk_error_queue);
-#ifdef CONFIG_NET_DMA
- skb_queue_head_init(&sk->sk_async_wait_queue);
-#endif
sk->sk_send_head = NULL;
@@ -2498,11 +2478,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
int level, int type)
{
struct sock_exterr_skb *serr;
- struct sk_buff *skb, *skb2;
+ struct sk_buff *skb;
int copied, err;
err = -EAGAIN;
- skb = skb_dequeue(&sk->sk_error_queue);
+ skb = sock_dequeue_err_skb(sk);
if (skb == NULL)
goto out;
@@ -2511,7 +2491,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto out_free_skb;
@@ -2523,16 +2503,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
- /* Reset and regenerate socket error */
- spin_lock_bh(&sk->sk_error_queue.lock);
- sk->sk_err = 0;
- if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
- sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_bh(&sk->sk_error_queue.lock);
- sk->sk_error_report(sk);
- } else
- spin_unlock_bh(&sk->sk_error_queue.lock);
-
out_free_skb:
kfree_skb(skb);
out:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cf9cd13509a7..31baba2a71ce 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,8 @@ static int zero = 0;
static int one = 1;
static int ushort_max = USHRT_MAX;
+static int net_msg_warn; /* Unused, but still a sysctl */
+
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -215,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
+static int proc_do_rss_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table fake_table;
+ char buf[NETDEV_RSS_KEY_LEN * 3];
+
+ snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+ fake_table.data = buf;
+ fake_table.maxlen = sizeof(buf);
+ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
@@ -263,6 +277,13 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "netdev_rss_key",
+ .data = &netdev_rss_key,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_do_rss_key,
+ },
#ifdef CONFIG_BPF_JIT
{
.procname = "bpf_jit_enable",
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index a8770391ea5b..43d3dd62fcc8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
{
struct phy_device *phydev;
struct sk_buff *clone;
- struct sock *sk = skb->sk;
unsigned int type;
- if (!sk)
+ if (!skb->sk)
return;
type = classify(skb);
@@ -48,50 +47,14 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
phydev = skb->dev->phydev;
if (likely(phydev->drv->txtstamp)) {
- if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ clone = skb_clone_sk(skb);
+ if (!clone)
return;
-
- clone = skb_clone(skb, GFP_ATOMIC);
- if (!clone) {
- sock_put(sk);
- return;
- }
-
- clone->sk = sk;
phydev->drv->txtstamp(phydev, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
-void skb_complete_tx_timestamp(struct sk_buff *skb,
- struct skb_shared_hwtstamps *hwtstamps)
-{
- struct sock *sk = skb->sk;
- struct sock_exterr_skb *serr;
- int err;
-
- if (!hwtstamps) {
- sock_put(sk);
- kfree_skb(skb);
- return;
- }
-
- *skb_hwtstamps(skb) = *hwtstamps;
-
- serr = SKB_EXT_ERR(skb);
- memset(serr, 0, sizeof(*serr));
- serr->ee.ee_errno = ENOMSG;
- serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
- skb->sk = NULL;
-
- err = sock_queue_err_skb(sk, skb);
-
- sock_put(sk);
- if (err)
- kfree_skb(skb);
-}
-EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
-
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
struct phy_device *phydev;
diff --git a/net/core/tso.c b/net/core/tso.c
index 8c3203c585b0..630b30b4fb53 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -1,6 +1,7 @@
#include <linux/export.h>
#include <net/ip.h>
#include <net/tso.h>
+#include <asm/unaligned.h>
/* Calculate expected number of TX descriptors */
int tso_count_descs(struct sk_buff *skb)
@@ -23,7 +24,7 @@ void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
iph->id = htons(tso->ip_id);
iph->tot_len = htons(size + hdr_len - mac_hdr_len);
tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
- tcph->seq = htonl(tso->tcp_seq);
+ put_unaligned_be32(tso->tcp_seq, &tcph->seq);
tso->ip_id++;
if (!is_last) {
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
deleted file mode 100644
index 1b5fefdb8198..000000000000
--- a/net/core/user_dma.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
- * Portions based on net/core/datagram.c and copyrighted by their authors.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-/*
- * This code allows the net stack to make use of a DMA engine for
- * skb to iovec copies.
- */
-
-#include <linux/dmaengine.h>
-#include <linux/socket.h>
-#include <linux/export.h>
-#include <net/tcp.h>
-#include <net/netdma.h>
-
-#define NET_DMA_DEFAULT_COPYBREAK 4096
-
-int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
-EXPORT_SYMBOL(sysctl_tcp_dma_copybreak);
-
-/**
- * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
- * @skb - buffer to copy
- * @offset - offset in the buffer to start copying from
- * @iovec - io vector to copy to
- * @len - amount of data to copy from buffer to iovec
- * @pinned_list - locked iovec buffer data
- *
- * Note: the iovec is modified during the copy.
- */
-int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
- struct sk_buff *skb, int offset, struct iovec *to,
- size_t len, struct dma_pinned_list *pinned_list)
-{
- int start = skb_headlen(skb);
- int i, copy = start - offset;
- struct sk_buff *frag_iter;
- dma_cookie_t cookie = 0;
-
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
- skb->data + offset, copy);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
-
- /* Copy paged appendix. Hmm... why does this look so complicated? */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
- const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- WARN_ON(start > offset + len);
-
- end = start + skb_frag_size(frag);
- copy = end - offset;
- if (copy > 0) {
- struct page *page = skb_frag_page(frag);
-
- if (copy > len)
- copy = len;
-
- cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
- frag->page_offset + offset - start, copy);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
- start = end;
- }
-
- skb_walk_frags(skb, frag_iter) {
- int end;
-
- WARN_ON(start > offset + len);
-
- end = start + frag_iter->len;
- copy = end - offset;
- if (copy > 0) {
- if (copy > len)
- copy = len;
- cookie = dma_skb_copy_datagram_iovec(chan, frag_iter,
- offset - start,
- to, copy,
- pinned_list);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
- start = end;
- }
-
-end:
- if (!len) {
- skb->dma_cookie = cookie;
- return cookie;
- }
-
-fault:
- return -EFAULT;
-}
diff --git a/net/core/utils.c b/net/core/utils.c
index eed34338736c..7b803884c162 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -33,9 +33,6 @@
#include <asm/byteorder.h>
#include <asm/uaccess.h>
-int net_msg_warn __read_mostly = 1;
-EXPORT_SYMBOL(net_msg_warn);
-
DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*
* All net warning printk()s should be guarded by this function.
@@ -306,16 +303,14 @@ EXPORT_SYMBOL(in6_pton);
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, int pseudohdr)
{
- __be32 diff[] = { ~from, to };
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_partial(diff, sizeof(diff),
- ~csum_unfold(*sum)));
+ *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
+ to));
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_partial(diff, sizeof(diff),
- ~skb->csum);
+ skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
} else if (pseudohdr)
- *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
- csum_unfold(*sum)));
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
+ to));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);