summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig19
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c79
-rw-r--r--net/ipv4/ah4.c19
-rw-r--r--net/ipv4/arp.c29
-rw-r--r--net/ipv4/cipso_ipv4.c6
-rw-r--r--net/ipv4/devinet.c71
-rw-r--r--net/ipv4/esp4.c41
-rw-r--r--net/ipv4/fib_frontend.c136
-rw-r--r--net/ipv4/fib_rules.c55
-rw-r--r--net/ipv4/fib_semantics.c99
-rw-r--r--net/ipv4/fib_trie.c15
-rw-r--r--net/ipv4/icmp.c125
-rw-r--r--net/ipv4/igmp.c18
-rw-r--r--net/ipv4/inet_connection_sock.c84
-rw-r--r--net/ipv4/inet_diag.c152
-rw-r--r--net/ipv4/inet_fragment.c2
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c6
-rw-r--r--net/ipv4/inetpeer.c111
-rw-r--r--net/ipv4/ip_forward.c5
-rw-r--r--net/ipv4/ip_fragment.c51
-rw-r--r--net/ipv4/ip_gre.c125
-rw-r--r--net/ipv4/ip_input.c41
-rw-r--r--net/ipv4/ip_options.c57
-rw-r--r--net/ipv4/ip_output.c94
-rw-r--r--net/ipv4/ip_sockglue.c26
-rw-r--r--net/ipv4/ip_vti.c956
-rw-r--r--net/ipv4/ipcomp.c17
-rw-r--r--net/ipv4/ipconfig.c20
-rw-r--r--net/ipv4/ipip.c83
-rw-r--r--net/ipv4/ipmr.c45
-rw-r--r--net/ipv4/netfilter.c12
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/arp_tables.c7
-rw-r--r--net/ipv4/netfilter/ip_queue.c639
-rw-r--r--net/ipv4/netfilter/ip_tables.c5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c3
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c23
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c191
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c93
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c34
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c13
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c12
-rw-r--r--net/ipv4/netfilter/nf_nat_tftp.c4
-rw-r--r--net/ipv4/ping.c23
-rw-r--r--net/ipv4/proc.c7
-rw-r--r--net/ipv4/protocol.c8
-rw-r--r--net/ipv4/raw.c7
-rw-r--r--net/ipv4/route.c1107
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c54
-rw-r--r--net/ipv4/tcp.c410
-rw-r--r--net/ipv4/tcp_cong.c11
-rw-r--r--net/ipv4/tcp_fastopen.c11
-rw-r--r--net/ipv4/tcp_hybla.c10
-rw-r--r--net/ipv4/tcp_input.c1017
-rw-r--r--net/ipv4/tcp_ipv4.c280
-rw-r--r--net/ipv4/tcp_memcontrol.c111
-rw-r--r--net/ipv4/tcp_metrics.c744
-rw-r--r--net/ipv4/tcp_minisocks.c74
-rw-r--r--net/ipv4/tcp_output.c493
-rw-r--r--net/ipv4/tcp_probe.c4
-rw-r--r--net/ipv4/tcp_timer.c75
-rw-r--r--net/ipv4/udp.c67
-rw-r--r--net/ipv4/udp_diag.c19
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c68
-rw-r--r--net/ipv4/xfrm4_policy.c30
74 files changed, 5288 insertions, 2999 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d183262943d9..5a19aeb86094 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -262,8 +262,8 @@ config ARPD
bool "IP: ARP daemon support"
---help---
The kernel maintains an internal cache which maps IP addresses to
- hardware addresses on the local network, so that Ethernet/Token Ring/
- etc. frames are sent to the proper address on the physical networking
+ hardware addresses on the local network, so that Ethernet
+ frames are sent to the proper address on the physical networking
layer. Normally, kernel uses the ARP protocol to resolve these
mappings.
@@ -310,9 +310,20 @@ config SYN_COOKIES
If unsure, say N.
+config NET_IPVTI
+ tristate "Virtual (secure) IP: tunneling"
+ select INET_TUNNEL
+ depends on INET_XFRM_MODE_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
+
config INET_AH
tristate "IP: AH transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_MD5
@@ -324,7 +335,7 @@ config INET_AH
config INET_ESP
tristate "IP: ESP transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_AUTHENC
select CRYPTO_HMAC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..ae2ccf2890e4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 10e3751466b5..fe4582ca969a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -242,20 +243,18 @@ void build_ehash_secret(void)
}
EXPORT_SYMBOL(build_ehash_secret);
-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
{
- int hash;
const struct net_protocol *ipprot;
if (net_eq(net, &init_net))
return 1;
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
-
- if (ipprot == NULL)
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot == NULL) {
/* raw IP is OK */
return 1;
+ }
return ipprot->netns_ok;
}
@@ -350,7 +349,7 @@ lookup_protocol:
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -541,7 +540,7 @@ out:
}
EXPORT_SYMBOL(inet_bind);
-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -553,15 +552,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending += writebias;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -577,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
return timeo;
}
@@ -584,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
@@ -594,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
- lock_sock(sk);
-
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -635,8 +634,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
/* Error code is set above */
- if (!timeo || !inet_wait_for_connect(sk, timeo))
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;
err = sock_intr_errno(timeo);
@@ -658,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
err = 0;
out:
- release_sock(sk);
return err;
sock_error:
@@ -668,6 +670,18 @@ sock_error:
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
EXPORT_SYMBOL(inet_stream_connect);
/*
@@ -1216,8 +1230,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
- const struct iphdr *iph;
const struct net_protocol *ops;
+ const struct iphdr *iph;
int proto;
int ihl;
int err = -EINVAL;
@@ -1236,7 +1250,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
__skb_pull(skb, ihl);
skb_reset_transport_header(skb);
iph = ip_hdr(skb);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
err = -EPROTONOSUPPORT;
rcu_read_lock();
@@ -1253,8 +1267,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct iphdr *iph;
const struct net_protocol *ops;
+ struct iphdr *iph;
int proto;
int ihl;
int id;
@@ -1286,7 +1300,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
skb_reset_transport_header(skb);
iph = ip_hdr(skb);
id = ntohs(iph->id);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
segs = ERR_PTR(-EPROTONOSUPPORT);
rcu_read_lock();
@@ -1340,7 +1354,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
goto out;
}
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
rcu_read_lock();
ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1412,11 @@ out:
static int inet_gro_complete(struct sk_buff *skb)
{
- const struct net_protocol *ops;
+ __be16 newlen = htons(skb->len - skb_network_offset(skb));
struct iphdr *iph = ip_hdr(skb);
- int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ const struct net_protocol *ops;
+ int proto = iph->protocol;
int err = -ENOSYS;
- __be16 newlen = htons(skb->len - skb_network_offset(skb));
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
@@ -1520,14 +1534,15 @@ static const struct net_protocol igmp_protocol = {
#endif
static const struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- .no_policy = 1,
- .netns_ok = 1,
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .gso_send_check = tcp_v4_gso_send_check,
+ .gso_segment = tcp_tso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ .no_policy = 1,
+ .netns_ok = 1,
};
static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index fd508b526014..a0d8392491c3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -77,7 +77,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
{
- unsigned char * optptr = (unsigned char*)(iph+1);
+ unsigned char *optptr = (unsigned char *)(iph+1);
int l = iph->ihl*4 - sizeof(struct iphdr);
int optlen;
@@ -398,16 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return;
+ case ICMP_REDIRECT:
+ break;
+ default:
return;
+ }
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
ah->spi, IPPROTO_AH, AF_INET);
if (!x)
return;
- printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
- ntohl(ah->spi), ntohl(iph->daddr));
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 18d9b81ecb1a..2e560f0c757d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -73,6 +73,8 @@
* Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -89,7 +91,6 @@
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
#include <linux/if_arp.h>
-#include <linux/trdevice.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -193,9 +194,6 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
case ARPHRD_IEEE802:
ip_eth_mc_map(addr, haddr);
return 0;
- case ARPHRD_IEEE802_TR:
- ip_tr_mc_map(addr, haddr);
- return 0;
case ARPHRD_INFINIBAND:
ip_ib_mc_map(addr, dev->broadcast, haddr);
return 0;
@@ -364,8 +362,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
probes -= neigh->parms->ucast_probes;
if (probes < 0) {
if (!(neigh->nud_state & NUD_VALID))
- printk(KERN_DEBUG
- "trying to ucast probe in NUD_INVALID\n");
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
dst_ha = neigh->ha;
read_lock_bh(&neigh->lock);
} else {
@@ -452,7 +449,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,
{
switch (addr_hint) {
case RTN_LOCAL:
- printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ pr_debug("arp called for own IP address\n");
memcpy(haddr, dev->dev_addr, dev->addr_len);
return 1;
case RTN_MULTICAST:
@@ -473,7 +470,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
struct neighbour *n;
if (!skb_dst(skb)) {
- printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ pr_debug("arp_find is called with dst==NULL\n");
kfree_skb(skb);
return 1;
}
@@ -648,12 +645,6 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
-#if IS_ENABLED(CONFIG_TR)
- case ARPHRD_IEEE802_TR:
- arp->ar_hrd = htons(ARPHRD_IEEE802);
- arp->ar_pro = htons(ETH_P_IP);
- break;
-#endif
}
arp->ar_hln = dev->addr_len;
@@ -751,11 +742,10 @@ static int arp_process(struct sk_buff *skb)
goto out;
break;
case ARPHRD_ETHER:
- case ARPHRD_IEEE802_TR:
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
/*
- * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
@@ -800,7 +790,8 @@ static int arp_process(struct sk_buff *skb)
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
- if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+ if (ipv4_is_multicast(tip) ||
+ (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
goto out;
/*
@@ -1059,7 +1050,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
- unsigned state = NUD_STALE;
+ unsigned int state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
@@ -1071,7 +1062,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
return err;
}
-static unsigned arp_state_to_flags(struct neighbour *neigh)
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
{
if (neigh->nud_state&NUD_PERMANENT)
return ATF_PERM | ATF_COM;
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index c48adc565e92..667c1d4ca984 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1725,8 +1725,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
case CIPSO_V4_TAG_LOCAL:
/* This is a non-standard tag that we only allow for
* local connections, so if the incoming interface is
- * not the loopback device drop the packet. */
- if (!(skb->dev->flags & IFF_LOOPBACK)) {
+ * not the loopback device drop the packet. Further,
+ * there is no legitimate reason for setting this from
+ * userspace so reject it if skb is NULL. */
+ if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
err_offset = opt_iter;
goto validate_return_locked;
}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6e447ff94dfa..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -217,8 +217,7 @@ void in_dev_finish_destroy(struct in_device *idev)
WARN_ON(idev->ifa_list);
WARN_ON(idev->mc_list);
#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
- idev, dev ? dev->name : "NIL");
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
dev_put(dev);
if (!idev->dead)
@@ -1125,7 +1124,7 @@ skip:
}
}
-static inline bool inetdev_valid_mtu(unsigned mtu)
+static inline bool inetdev_valid_mtu(unsigned int mtu)
{
return mtu >= 68;
}
@@ -1174,7 +1173,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
switch (event) {
case NETDEV_REGISTER:
- printk(KERN_DEBUG "inetdev_event: bug\n");
+ pr_debug("%s: bug\n", __func__);
RCU_INIT_POINTER(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
@@ -1266,17 +1265,15 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (ifa->ifa_address)
- NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
-
- if (ifa->ifa_local)
- NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
-
- if (ifa->ifa_broadcast)
- NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
-
- if (ifa->ifa_label[0])
- NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+ if ((ifa->ifa_address &&
+ nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)))
+ goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1503,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
if (cnf == net->ipv4.devconf_dflt)
devinet_copy_dflt_conf(net, i);
- if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
+ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+ i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net, 0);
}
@@ -1587,7 +1585,6 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
- char *dev_name;
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1621,6 +1618,8 @@ static struct devinet_sysctl_table {
"force_igmp_version"),
DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
"promote_secondaries"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+ "route_localnet"),
},
};
@@ -1629,16 +1628,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
{
int i;
struct devinet_sysctl_table *t;
-
-#define DEVINET_CTL_PATH_DEV 3
-
- struct ctl_path devinet_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "conf", },
- { /* to be set */ },
- { },
- };
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
if (!t)
@@ -1650,27 +1640,15 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
t->devinet_vars[i].extra2 = net;
}
- /*
- * Make a copy of dev_name, because '.procname' is regarded as const
- * by sysctl and we wouldn't want anyone to change it under our feet
- * (see SIOCSIFNAME).
- */
- t->dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!t->dev_name)
- goto free;
-
- devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
- t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
- t->devinet_vars);
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free;
p->sysctl = t;
return 0;
-free_procname:
- kfree(t->dev_name);
free:
kfree(t);
out:
@@ -1686,7 +1664,6 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
cnf->sysctl = NULL;
unregister_net_sysctl_table(t->sysctl_header);
- kfree(t->dev_name);
kfree(t);
}
@@ -1716,12 +1693,6 @@ static struct ctl_table ctl_forward_entry[] = {
},
{ },
};
-
-static __net_initdata struct ctl_path net_ipv4_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
#endif
static __net_init int devinet_init_net(struct net *net)
@@ -1767,7 +1738,7 @@ static __net_init int devinet_init_net(struct net *net)
goto err_reg_dflt;
err = -ENOMEM;
- forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
+ forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
if (forw_hdr == NULL)
goto err_reg_ctl;
net->ipv4.forw_hdr = forw_hdr;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 89a47b35905d..b61e9deb7c7e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -459,28 +459,22 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
struct esp_data *esp = x->data;
u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
u32 align = max_t(u32, blksize, esp->padlen);
- u32 rem;
-
- mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
- rem = mtu & (align - 1);
- mtu &= ~(align - 1);
+ unsigned int net_adj;
switch (x->props.mode) {
- case XFRM_MODE_TUNNEL:
- break;
- default:
case XFRM_MODE_TRANSPORT:
- /* The worst case */
- mtu -= blksize - 4;
- mtu += min_t(u32, blksize - 4, rem);
- break;
case XFRM_MODE_BEET:
- /* The worst case. */
- mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
+ net_adj = sizeof(struct iphdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ net_adj = 0;
break;
+ default:
+ BUG();
}
- return mtu - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
+ net_adj) & ~(align - 1)) + (net_adj - 2);
}
static void esp4_err(struct sk_buff *skb, u32 info)
@@ -490,16 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return;
+ case ICMP_REDIRECT:
+ break;
+ default:
return;
+ }
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr));
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cbe3a68507cf..b83203658ee3 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
+#include <linux/cache.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
@@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
tb = fib_trie_table(id);
if (!tb)
return NULL;
+
+ switch (id) {
+ case RT_TABLE_LOCAL:
+ net->ipv4.fib_local = tb;
+ break;
+
+ case RT_TABLE_MAIN:
+ net->ipv4.fib_main = tb;
+ break;
+
+ case RT_TABLE_DEFAULT:
+ net->ipv4.fib_default = tb;
+ break;
+
+ default:
+ break;
+ }
+
h = id & (FIB_TABLE_HASHSZ - 1);
hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
@@ -136,13 +155,13 @@ static void fib_flush(struct net *net)
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
*/
-static inline unsigned __inet_dev_addr_type(struct net *net,
- const struct net_device *dev,
- __be32 addr)
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
{
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
- unsigned ret = RTN_BROADCAST;
+ unsigned int ret = RTN_BROADCAST;
struct fib_table *local_table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
@@ -150,10 +169,6 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
@@ -180,6 +195,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
}
EXPORT_SYMBOL(inet_dev_addr_type);
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ struct net *net;
+ int scope;
+
+ rt = skb_rtable(skb);
+ if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+ RTCF_LOCAL)
+ return ip_hdr(skb)->daddr;
+
+ in_dev = __in_dev_get_rcu(dev);
+ BUG_ON(!in_dev);
+
+ net = dev_net(dev);
+
+ scope = RT_SCOPE_UNIVERSE;
+ if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = net->loopback_dev->ifindex;
+ fl4.daddr = ip_hdr(skb)->saddr;
+ fl4.saddr = 0;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_scope = scope;
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ if (!fib_lookup(net, &fl4, &res))
+ return FIB_RES_PREFSRC(net, res);
+ } else {
+ scope = RT_SCOPE_LINK;
+ }
+
+ return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
/* Given (packet source, input interface) and optional (dst, oif, tos):
* - (main) check, that source is valid i.e. not broadcast or our local
* address.
@@ -188,17 +241,15 @@ EXPORT_SYMBOL(inet_dev_addr_type);
* - check, that packet arrived from expected physical interface.
* called with rcu_read_lock()
*/
-int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
- int oif, struct net_device *dev, __be32 *spec_dst,
- u32 *itag)
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ int rpf, struct in_device *idev, u32 *itag)
{
- struct in_device *in_dev;
- struct flowi4 fl4;
+ int ret, no_addr, accept_local;
struct fib_result res;
- int no_addr, rpf, accept_local;
- bool dev_match;
- int ret;
+ struct flowi4 fl4;
struct net *net;
+ bool dev_match;
fl4.flowi4_oif = 0;
fl4.flowi4_iif = oif;
@@ -207,20 +258,11 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
- no_addr = rpf = accept_local = 0;
- in_dev = __in_dev_get_rcu(dev);
- if (in_dev) {
- no_addr = in_dev->ifa_list == NULL;
-
- /* Ignore rp_filter for packets protected by IPsec. */
- rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
-
- accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
- fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
- }
+ no_addr = accept_local = 0;
+ no_addr = idev->ifa_list == NULL;
- if (in_dev == NULL)
- goto e_inval;
+ accept_local = IN_DEV_ACCEPT_LOCAL(idev);
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
net = dev_net(dev);
if (fib_lookup(net, &fl4, &res))
@@ -229,7 +271,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
if (res.type != RTN_LOCAL || !accept_local)
goto e_inval;
}
- *spec_dst = FIB_RES_PREFSRC(net, res);
fib_combine_itag(itag, &res);
dev_match = false;
@@ -258,17 +299,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
ret = 0;
if (fib_lookup(net, &fl4, &res) == 0) {
- if (res.type == RTN_UNICAST) {
- *spec_dst = FIB_RES_PREFSRC(net, res);
+ if (res.type == RTN_UNICAST)
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- }
}
return ret;
last_resort:
if (rpf)
goto e_rpf;
- *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
*itag = 0;
return 0;
@@ -278,6 +316,20 @@ e_rpf:
return -EXDEV;
}
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ struct in_device *idev, u32 *itag)
+{
+ int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+ if (!r && !fib_num_tclassid_users(dev_net(dev))) {
+ *itag = 0;
+ return 0;
+ }
+ return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
static inline __be32 sk_extract_addr(struct sockaddr *addr)
{
return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -740,7 +792,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
- unsigned ok = 0;
+ unsigned int ok = 0;
int subnet = 0; /* Primary network */
int gone = 1; /* Address is missing */
int same_prefsrc = 0; /* Another primary with same IP */
@@ -879,10 +931,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
.flowi4_scope = frn->fl_scope,
};
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
frn->err = -ENOENT;
if (tb) {
local_bh_disable();
@@ -935,8 +983,11 @@ static void nl_fib_input(struct sk_buff *skb)
static int __net_init nl_fib_lookup_init(struct net *net)
{
struct sock *sk;
- sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
- nl_fib_input, NULL, THIS_MODULE);
+ struct netlink_kernel_cfg cfg = {
+ .input = nl_fib_input,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
if (sk == NULL)
return -EAFNOSUPPORT;
net->ipv4.fibnl = sk;
@@ -1090,6 +1141,9 @@ static int __net_init fib_net_init(struct net *net)
{
int error;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ net->ipv4.fib_num_tclassid_users = 0;
+#endif
error = ip_fib_net_init(net);
if (error < 0)
goto out;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 799fc790b3cf..a83d74e498d2 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,14 +47,7 @@ struct fib4_rule {
#endif
};
-#ifdef CONFIG_IP_ROUTE_CLASSID
-u32 fib_rules_tclass(const struct fib_result *res)
-{
- return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
-}
-#endif
-
-int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
@@ -63,11 +56,15 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
int err;
err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
- res->r = arg.rule;
-
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (arg.rule)
+ res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+ else
+ res->tclassid = 0;
+#endif
return err;
}
-EXPORT_SYMBOL_GPL(fib_lookup);
+EXPORT_SYMBOL_GPL(__fib_lookup);
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
@@ -169,8 +166,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->dst = nla_get_be32(tb[FRA_DST]);
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (tb[FRA_FLOW])
+ if (tb[FRA_FLOW]) {
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users++;
+ }
#endif
rule4->src_len = frh->src_len;
@@ -179,11 +179,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->dstmask = inet_make_mask(rule4->dst_len);
rule4->tos = frh->tos;
+ net->ipv4.fib_has_custom_rules = true;
err = 0;
errout:
return err;
}
+static void fib4_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users--;
+#endif
+ net->ipv4.fib_has_custom_rules = true;
+}
+
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
struct nlattr **tb)
{
@@ -221,15 +234,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->src_len = rule4->src_len;
frh->tos = rule4->tos;
- if (rule4->dst_len)
- NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
-
- if (rule4->src_len)
- NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-
+ if ((rule4->dst_len &&
+ nla_put_be32(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_be32(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (rule4->tclassid)
- NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
#endif
return 0;
@@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
.action = fib4_rule_action,
.match = fib4_rule_match,
.configure = fib4_rule_configure,
+ .delete = fib4_rule_delete,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
.default_pref = fib_default_rule_pref,
@@ -295,6 +309,7 @@ int __net_init fib4_rules_init(struct net *net)
if (err < 0)
goto fail;
net->ipv4.rules_ops = ops;
+ net->ipv4.fib_has_custom_rules = false;
return 0;
fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5063fa38ac7b..2b57d768240d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,11 +140,40 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
},
};
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+ struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = rcu_dereference_protected(hash[i].chain, 1);
+ while (fnhe) {
+ struct fib_nh_exception *next;
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+ kfree(fnhe);
+
+ fnhe = next;
+ }
+ }
+ kfree(hash);
+}
+
/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ if (nexthop_nh->nh_exceptions)
+ free_nh_exceptions(nexthop_nh);
+ } endfor_nexthops(fi);
+
+ release_net(fi->fib_net);
if (fi->fib_metrics != (u32 *) dst_default_metrics)
kfree(fi->fib_metrics);
kfree(fi);
@@ -156,13 +185,13 @@ void free_fib_info(struct fib_info *fi)
pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
+ fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
change_nexthops(fi) {
- if (nexthop_nh->nh_dev)
- dev_put(nexthop_nh->nh_dev);
- nexthop_nh->nh_dev = NULL;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users--;
} endfor_nexthops(fi);
- fib_info_cnt--;
- release_net(fi->fib_net);
+#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
@@ -421,6 +450,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
}
@@ -779,9 +810,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
int type = nla_type(nla);
if (type) {
+ u32 val;
+
if (type > RTAX_MAX)
goto err_inval;
- fi->fib_metrics[type - 1] = nla_get_u32(nla);
+ val = nla_get_u32(nla);
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ fi->fib_metrics[type - 1] = val;
}
}
}
@@ -810,6 +848,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight = 1;
@@ -931,33 +971,36 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtm->rtm_table = tb_id;
else
rtm->rtm_table = RT_TABLE_COMPAT;
- NLA_PUT_U32(skb, RTA_TABLE, tb_id);
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
rtm->rtm_type = type;
rtm->rtm_flags = fi->fib_flags;
rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
- if (rtm->rtm_dst_len)
- NLA_PUT_BE32(skb, RTA_DST, dst);
-
- if (fi->fib_priority)
- NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
-
+ if (rtm->rtm_dst_len &&
+ nla_put_be32(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
goto nla_put_failure;
- if (fi->fib_prefsrc)
- NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
-
+ if (fi->fib_prefsrc &&
+ nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
-
- if (fi->fib_nh->nh_oif)
- NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
+ if (fi->fib_nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ goto nla_put_failure;
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (fi->fib_nh[0].nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
+ if (fi->fib_nh[0].nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+ goto nla_put_failure;
#endif
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -978,11 +1021,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
- if (nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
+ if (nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (nh->nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
#endif
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index bce36f1a37b4..18cbc15b20d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *) resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, tn);
- tnode_put_child_reorg((struct tnode *)tp, cindex,
+ tnode_put_child_reorg(tp, cindex,
(struct rt_trie_node *)tn, wasfull);
tp = node_parent((struct rt_trie_node *) tn);
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode *)resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, tn);
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
+ put_child(t, tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex,
- (struct rt_trie_node *)tn);
+ put_child(t, tp, cindex, (struct rt_trie_node *)tn);
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
@@ -1370,6 +1369,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
continue;
+ if (fi->fib_dead)
+ continue;
if (fa->fa_info->fib_scope < flp->flowi4_scope)
continue;
fib_alias_accessed(fa);
@@ -1618,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
if (tp) {
t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, NULL);
+ put_child(t, tp, cindex, NULL);
trie_rebalance(t, tp);
} else
RCU_INIT_POINTER(t->trie, NULL);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2cb2bf845641..ea3a996de95b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
+#include <net/ip_fib.h>
/*
* Build xmit assembly blocks
@@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
/* Limit if icmp type is enabled in ratemask. */
if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
- if (!rt->peer)
- rt_bind_peer(rt, fl4->daddr, 1);
- rc = inet_peer_xrlim_allow(rt->peer,
+ struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
+ inet_putpeer(peer);
}
out:
return rc;
@@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct flowi4 fl4;
struct sock *sk;
struct inet_sock *inet;
- __be32 daddr;
+ __be32 daddr, saddr;
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
@@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
inet->tos = ip_hdr(skb)->tos;
daddr = ipc.addr = ip_hdr(skb)->saddr;
+ saddr = fib_compute_spec_dst(skb);
ipc.opt = NULL;
ipc.tx_flags = 0;
if (icmp_param->replyopts.opt.opt.optlen) {
@@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
}
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
- fl4.saddr = rt->rt_spec_dst;
+ fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -632,6 +634,27 @@ out:;
EXPORT_SYMBOL(icmp_send);
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ /* Checkin full IP header plus 8 bytes of protocol to
+ * avoid additional coding at protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ return;
+
+ raw_icmp_error(skb, protocol, info);
+
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
/*
* Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
*/
@@ -640,10 +663,8 @@ static void icmp_unreach(struct sk_buff *skb)
{
const struct iphdr *iph;
struct icmphdr *icmph;
- int hash, protocol;
- const struct net_protocol *ipprot;
- u32 info = 0;
struct net *net;
+ u32 info = 0;
net = dev_net(skb_dst(skb)->dev);
@@ -674,9 +695,7 @@ static void icmp_unreach(struct sk_buff *skb)
LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
&iph->daddr);
} else {
- info = ip_rt_frag_needed(net, iph,
- ntohs(icmph->un.frag.mtu),
- skb->dev);
+ info = ntohs(icmph->un.frag.mtu);
if (!info)
goto out;
}
@@ -713,34 +732,14 @@ static void icmp_unreach(struct sk_buff *skb)
if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
- if (net_ratelimit())
- pr_warn("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
- &ip_hdr(skb)->saddr,
- icmph->type, icmph->code,
- &iph->daddr, skb->dev->name);
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
goto out;
}
- /* Checkin full IP header plus 8 bytes of protocol to
- * avoid additional coding at protocol handlers.
- */
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
- goto out;
-
- iph = (const struct iphdr *)skb->data;
- protocol = iph->protocol;
-
- /*
- * Deliver ICMP message to raw sockets. Pretty useless feature?
- */
- raw_icmp_error(skb, protocol, info);
-
- hash = protocol & (MAX_INET_PROTOS - 1);
- rcu_read_lock();
- ipprot = rcu_dereference(inet_protos[hash]);
- if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, info);
- rcu_read_unlock();
+ icmp_socket_deliver(skb, info);
out:
return;
@@ -756,46 +755,15 @@ out_err:
static void icmp_redirect(struct sk_buff *skb)
{
- const struct iphdr *iph;
-
- if (skb->len < sizeof(struct iphdr))
- goto out_err;
-
- /*
- * Get the copied header of the packet that caused the redirect
- */
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
-
- iph = (const struct iphdr *)skb->data;
-
- switch (icmp_hdr(skb)->code & 7) {
- case ICMP_REDIR_NET:
- case ICMP_REDIR_NETTOS:
- /*
- * As per RFC recommendations now handle it as a host redirect.
- */
- case ICMP_REDIR_HOST:
- case ICMP_REDIR_HOSTTOS:
- ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
- icmp_hdr(skb)->un.gateway,
- iph->saddr, skb->dev);
- break;
+ if (skb->len < sizeof(struct iphdr)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return;
}
- /* Ping wants to see redirects.
- * Let's pretend they are errors of sorts... */
- if (iph->protocol == IPPROTO_ICMP &&
- iph->ihl >= 5 &&
- pskb_may_pull(skb, (iph->ihl<<2)+8)) {
- ping_err(skb, icmp_hdr(skb)->un.gateway);
- }
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return;
-out:
- return;
-out_err:
- ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
- goto out;
+ icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
}
/*
@@ -906,8 +874,7 @@ out_err:
static void icmp_address(struct sk_buff *skb)
{
#if 0
- if (net_ratelimit())
- printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
+ net_dbg_ratelimited("a guy asks for address mask. Who is it?\n");
#endif
}
@@ -943,10 +910,10 @@ static void icmp_address_reply(struct sk_buff *skb)
inet_ifa_match(ip_hdr(skb)->saddr, ifa))
break;
}
- if (!ifa && net_ratelimit()) {
- pr_info("Wrong address mask %pI4 from %s/%pI4\n",
- mp, dev->name, &ip_hdr(skb)->saddr);
- }
+ if (!ifa)
+ net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n",
+ mp,
+ dev->name, &ip_hdr(skb)->saddr);
}
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5dfecfd7d5e9..6699f23e6f55 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -344,10 +344,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
ip_select_ident(pip, &rt->dst, NULL);
- ((u8*)&pip[1])[0] = IPOPT_RA;
- ((u8*)&pip[1])[1] = 4;
- ((u8*)&pip[1])[2] = 0;
- ((u8*)&pip[1])[3] = 0;
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
skb_put(skb, sizeof(*pig));
@@ -688,10 +688,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
ip_select_ident(iph, &rt->dst, NULL);
- ((u8*)&iph[1])[0] = IPOPT_RA;
- ((u8*)&iph[1])[1] = 4;
- ((u8*)&iph[1])[2] = 0;
- ((u8*)&iph[1])[3] = 0;
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
ih->type = type;
@@ -774,7 +774,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
- continue;
+ break;
if (srcs[i] == psf->sf_inaddr) {
scount++;
break;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19d66cefd7d3..c7a4de05ca04 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -42,7 +42,8 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports);
void inet_get_local_port_range(int *low, int *high)
{
- unsigned seq;
+ unsigned int seq;
+
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -53,7 +54,7 @@ void inet_get_local_port_range(int *low, int *high)
EXPORT_SYMBOL(inet_get_local_port_range);
int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
+ const struct inet_bind_bucket *tb, bool relax)
{
struct sock *sk2;
struct hlist_node *node;
@@ -79,6 +80,14 @@ int inet_csk_bind_conflict(const struct sock *sk,
sk2_rcv_saddr == sk_rcv_saddr(sk))
break;
}
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+ const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
+
+ if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
+ sk2_rcv_saddr == sk_rcv_saddr(sk))
+ break;
+ }
}
}
return node != NULL;
@@ -122,12 +131,13 @@ again:
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+ !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = smallest_rover;
goto tb_found;
}
}
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
goto tb_found;
}
@@ -172,18 +182,22 @@ have_snum:
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE)
+ goto success;
+
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}
+
goto fail_unlock;
}
}
@@ -354,16 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(struct sock *sk,
struct flowi4 *fl4,
- const struct request_sock *req)
+ const struct request_sock *req,
+ bool nocache)
{
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options_rcu *opt = inet_rsk(req)->opt;
struct net *net = sock_net(sk);
+ int flags = inet_sk_flowi_flags(sk);
+ if (nocache)
+ flags |= FLOWI_FLAG_RT_NOCACHE;
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
- sk->sk_protocol, inet_sk_flowi_flags(sk),
+ sk->sk_protocol,
+ flags,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
@@ -514,7 +533,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
+ * If synack was not acknowledged for 1 second, it means
* one of the following things: synack was lost, ack was lost,
* rtt is high or nobody planned to ack (i.e. synflood).
* When server is a bit loaded, queue is populated with old
@@ -555,8 +574,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
syn_ack_recalc(req, thresh, max_retries,
queue->rskq_defer_accept,
&expire, &resend);
- if (req->rsk_ops->syn_ack_timeout)
- req->rsk_ops->syn_ack_timeout(parent, req);
+ req->rsk_ops->syn_ack_timeout(parent, req);
if (!expire &&
(!resend ||
!req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
@@ -785,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif
+
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ fl4 = &fl->u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ rt = NULL;
+ if (rt)
+ sk_setup_caps(sk, &rt->dst);
+ rcu_read_unlock();
+
+ return &rt->dst;
+}
+
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!dst) {
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+ if (!dst)
+ goto out;
+ }
+ dst->ops->update_pmtu(dst, sk, NULL, mtu);
+
+ dst = __sk_dst_check(sk, 0);
+ if (!dst)
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+ return dst;
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8d25a1c557eb..570e61f9611f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -46,9 +46,6 @@ struct inet_diag_entry {
u16 userlocks;
};
-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
- RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
static DEFINE_MUTEX(inet_diag_table_mutex);
static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
@@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
const struct inet_sock *inet = inet_sk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
+ struct nlattr *attr;
void *info = NULL;
- struct inet_diag_meminfo *minfo = NULL;
- unsigned char *b = skb_tail_pointer(skb);
const struct inet_diag_handler *handler;
int ext = req->idiag_ext;
handler = inet_diag_table[req->sdiag_protocol];
BUG_ON(handler == NULL);
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
+ nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- r = NLMSG_DATA(nlh);
+ r = nlmsg_data(nlh);
BUG_ON(sk->sk_state == TCP_TIME_WAIT);
- if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
- minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
-
r->idiag_family = sk->sk_family;
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
@@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
* hence this needs to be included regardless of socket family.
*/
if (ext & (1 << (INET_DIAG_TOS - 1)))
- RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos);
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
@@ -121,27 +117,34 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
*(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
*(struct in6_addr *)r->id.idiag_dst = np->daddr;
+
if (ext & (1 << (INET_DIAG_TCLASS - 1)))
- RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass);
+ if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
+ goto errout;
}
#endif
r->idiag_uid = sock_i_uid(sk);
r->idiag_inode = sock_i_ino(sk);
- if (minfo) {
- minfo->idiag_rmem = sk_rmem_alloc_get(sk);
- minfo->idiag_wmem = sk->sk_wmem_queued;
- minfo->idiag_fmem = sk->sk_forward_alloc;
- minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+ struct inet_diag_meminfo minfo = {
+ .idiag_rmem = sk_rmem_alloc_get(sk),
+ .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_tmem = sk_wmem_alloc_get(sk),
+ };
+
+ if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+ goto errout;
}
if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
- goto rtattr_failure;
+ goto errout;
if (icsk == NULL) {
- r->idiag_rqueue = r->idiag_wqueue = 0;
+ handler->idiag_get_info(sk, r, NULL);
goto out;
}
@@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
}
#undef EXPIRES_IN_MS
- if (ext & (1 << (INET_DIAG_INFO - 1)))
- info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info));
-
- if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
- const size_t len = strlen(icsk->icsk_ca_ops->name);
+ if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ attr = nla_reserve(skb, INET_DIAG_INFO,
+ sizeof(struct tcp_info));
+ if (!attr)
+ goto errout;
- strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
- icsk->icsk_ca_ops->name);
+ info = nla_data(attr);
}
+ if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
+ if (nla_put_string(skb, INET_DIAG_CONG,
+ icsk->icsk_ca_ops->name) < 0)
+ goto errout;
+
handler->idiag_get_info(sk, r, info);
if (sk->sk_state < TCP_TIME_WAIT &&
@@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
icsk->icsk_ca_ops->get_info(sk, ext, skb);
out:
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
+ return nlmsg_end(skb, nlh);
-rtattr_failure:
-nlmsg_failure:
- nlmsg_trim(skb, b);
+errout:
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
@@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
{
long tmo;
struct inet_diag_msg *r;
- const unsigned char *previous_tail = skb_tail_pointer(skb);
- struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
- unlh->nlmsg_type, sizeof(*r));
+ struct nlmsghdr *nlh;
- r = NLMSG_DATA(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+ nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- nlh->nlmsg_flags = nlmsg_flags;
+ r = nlmsg_data(nlh);
+ BUG_ON(tw->tw_state != TCP_TIME_WAIT);
tmo = tw->tw_ttd - jiffies;
if (tmo < 0)
@@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
*(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
}
#endif
- nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
- return skb->len;
-nlmsg_failure:
- nlmsg_trim(skb, previous_tail);
- return -EMSGSIZE;
+
+ return nlmsg_end(skb, nlh);
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -269,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
int err;
struct sock *sk;
struct sk_buff *rep;
+ struct net *net = sock_net(in_skb->sk);
err = -EINVAL;
if (req->sdiag_family == AF_INET) {
- sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6) {
- sk = inet6_lookup(&init_net, hashinfo,
+ sk = inet6_lookup(net, hashinfo,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
@@ -298,23 +302,23 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
if (err)
goto out;
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) +
- sizeof(struct tcp_info) + 64)),
- GFP_KERNEL);
- if (!rep)
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) +
+ sizeof(struct tcp_info) + 64, GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
goto out;
+ }
err = sk_diag_fill(sk, rep, req,
NETLINK_CB(in_skb).pid,
nlh->nlmsg_seq, 0, nlh);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
- kfree_skb(rep);
+ nlmsg_free(rep);
goto out;
}
- err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
MSG_DONTWAIT);
if (err > 0)
err = 0;
@@ -592,15 +596,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb_tail_pointer(skb);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+ NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+ r = nlmsg_data(nlh);
r->idiag_family = sk->sk_family;
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
@@ -628,13 +633,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
*(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
}
#endif
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-
- return skb->len;
-nlmsg_failure:
- nlmsg_trim(skb, b);
- return -1;
+ return nlmsg_end(skb, nlh);
}
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
@@ -725,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
{
int i, num;
int s_i, s_num;
+ struct net *net = sock_net(skb->sk);
s_i = cb->args[1];
s_num = num = cb->args[2];
@@ -744,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
sk_nulls_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
if (num < s_num) {
num++;
continue;
@@ -814,6 +818,8 @@ skip_listen_ht:
sk_nulls_for_each(sk, node, &head->chain) {
struct inet_sock *inet = inet_sk(sk);
+ if (!net_eq(sock_net(sk), net))
+ continue;
if (num < s_num)
goto next_normal;
if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -840,6 +846,8 @@ next_normal:
inet_twsk_for_each(tw, node,
&head->twchain) {
+ if (!net_eq(twsk_net(tw), net))
+ continue;
if (num < s_num)
goto next_dying;
@@ -892,7 +900,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_attrlen(cb->nlh, hdrlen))
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
- return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc);
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
}
static inline int inet_diag_type2proto(int type)
@@ -909,7 +917,7 @@ static inline int inet_diag_type2proto(int type)
static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct inet_diag_req *rc = NLMSG_DATA(cb->nlh);
+ struct inet_diag_req *rc = nlmsg_data(cb->nlh);
struct inet_diag_req_v2 req;
struct nlattr *bc = NULL;
int hdrlen = sizeof(struct inet_diag_req);
@@ -929,7 +937,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
const struct nlmsghdr *nlh)
{
- struct inet_diag_req *rc = NLMSG_DATA(nlh);
+ struct inet_diag_req *rc = nlmsg_data(nlh);
struct inet_diag_req_v2 req;
req.sdiag_family = rc->idiag_family;
@@ -944,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int hdrlen = sizeof(struct inet_diag_req);
+ struct net *net = sock_net(skb->sk);
if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
nlmsg_len(nlh) < hdrlen)
@@ -964,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
struct netlink_dump_control c = {
.dump = inet_diag_dump_compat,
};
- return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c);
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
}
}
@@ -974,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct net *net = sock_net(skb->sk);
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
@@ -992,19 +1002,19 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
struct netlink_dump_control c = {
.dump = inet_diag_dump,
};
- return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
}
- return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
+ return inet_diag_get_exact(skb, h, nlmsg_data(h));
}
-static struct sock_diag_handler inet_diag_handler = {
+static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
.dump = inet_diag_handler_dump,
};
-static struct sock_diag_handler inet6_diag_handler = {
+static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
.dump = inet_diag_handler_dump,
};
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0c..85190e69297b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
if (q == NULL)
return NULL;
+ q->net = nf;
f->constructor(q, arg);
atomic_add(f->qsize, &nf->mem);
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
- q->net = nf;
return q;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 984ec656b03b..7880af970208 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -217,7 +217,7 @@ begin:
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
-struct sock * __inet_lookup_established(struct net *net,
+struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 89168c6351ff..2784db3155fb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -89,8 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
#ifdef SOCK_REFCNT_DEBUG
if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
- tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+ pr_debug("%s timewait_sock %p refcnt=%d\n",
+ tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
}
#endif
while (refcnt) {
@@ -263,7 +263,7 @@ rescan:
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
- int unsigned need_timer;
+ unsigned int need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d4d61b694fab..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = {
.avl_height = 0
};
-struct inet_peer_base {
- struct inet_peer __rcu *root;
- seqlock_t lock;
- int total;
-};
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->root = peer_avl_empty_rcu;
+ seqlock_init(&bp->lock);
+ bp->flush_seq = ~0U;
+ bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
-static struct inet_peer_base v4_peers = {
- .root = peer_avl_empty_rcu,
- .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
- .total = 0,
-};
+static atomic_t v4_seq = ATOMIC_INIT(0);
+static atomic_t v6_seq = ATOMIC_INIT(0);
-static struct inet_peer_base v6_peers = {
- .root = peer_avl_empty_rcu,
- .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
- .total = 0,
-};
+static atomic_t *inetpeer_seq_ptr(int family)
+{
+ return (family == AF_INET ? &v4_seq : &v6_seq);
+}
+
+static inline void flush_check(struct inet_peer_base *base, int family)
+{
+ atomic_t *fp = inetpeer_seq_ptr(family);
+
+ if (unlikely(base->flush_seq != atomic_read(fp))) {
+ inetpeer_invalidate_tree(base);
+ base->flush_seq = atomic_read(fp);
+ }
+}
+
+void inetpeer_invalidate_family(int family)
+{
+ atomic_t *fp = inetpeer_seq_ptr(family);
+
+ atomic_inc(fp);
+}
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
@@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
static void inetpeer_gc_worker(struct work_struct *work)
{
- struct inet_peer *p, *n;
+ struct inet_peer *p, *n, *c;
LIST_HEAD(list);
spin_lock_bh(&gc_lock);
@@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work)
list_for_each_entry_safe(p, n, &list, gc_list) {
- if(need_resched())
+ if (need_resched())
cond_resched();
- if (p->avl_left != peer_avl_empty) {
- list_add_tail(&p->avl_left->gc_list, &list);
- p->avl_left = peer_avl_empty;
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
}
- if (p->avl_right != peer_avl_empty) {
- list_add_tail(&p->avl_right->gc_list, &list);
- p->avl_right = peer_avl_empty;
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
}
n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
@@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
call_rcu(&p->rcu, inetpeer_free_rcu);
}
-static struct inet_peer_base *family_to_base(int family)
-{
- return family == AF_INET ? &v4_peers : &v6_peers;
-}
-
/* perform garbage collect on all items stacked during a lookup */
static int inet_peer_gc(struct inet_peer_base *base,
struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base,
return cnt;
}
-struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr,
+ int create)
{
struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer_base *base = family_to_base(daddr->family);
struct inet_peer *p;
unsigned int sequence;
int invalidated, gccnt = 0;
+ flush_check(base, daddr->family);
+
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
@@ -492,13 +508,9 @@ relookup:
(daddr->family == AF_INET) ?
secure_ip_id(daddr->addr.a4) :
secure_ipv6_id(daddr->addr.a6));
- p->tcp_ts_stamp = 0;
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
p->rate_last = 0;
- p->pmtu_expires = 0;
- p->pmtu_orig = 0;
- memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
INIT_LIST_HEAD(&p->gc_list);
/* Link the node. */
@@ -560,29 +572,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
-void inetpeer_invalidate_tree(int family)
+static void inetpeer_inval_rcu(struct rcu_head *head)
{
- struct inet_peer *old, *new, *prev;
- struct inet_peer_base *base = family_to_base(family);
+ struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
- write_seqlock_bh(&base->lock);
+ spin_lock_bh(&gc_lock);
+ list_add_tail(&p->gc_list, &gc_list);
+ spin_unlock_bh(&gc_lock);
- old = base->root;
- if (old == peer_avl_empty_rcu)
- goto out;
+ schedule_delayed_work(&gc_work, gc_delay);
+}
- new = peer_avl_empty_rcu;
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
+{
+ struct inet_peer *root;
+
+ write_seqlock_bh(&base->lock);
- prev = cmpxchg(&base->root, old, new);
- if (prev == old) {
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
base->total = 0;
- spin_lock(&gc_lock);
- list_add_tail(&prev->gc_list, &gc_list);
- spin_unlock(&gc_lock);
- schedule_delayed_work(&gc_work, gc_delay);
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
}
-out:
write_sequnlock_bh(&base->lock);
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 29a07b6c7168..ab09b126423c 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -41,9 +41,10 @@
static int ip_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -55,7 +56,7 @@ int ip_forward(struct sk_buff *skb)
{
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
if (skb_warn_if_lro(skb))
goto drop;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 3727e234c884..8d07c973409c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -148,17 +148,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
-static int ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
{
struct ipq *qp;
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
- qp->saddr == arg->iph->saddr &&
- qp->daddr == arg->iph->daddr &&
- qp->protocol == arg->iph->protocol &&
- qp->user == arg->user;
+ qp->saddr == arg->iph->saddr &&
+ qp->daddr == arg->iph->daddr &&
+ qp->protocol == arg->iph->protocol &&
+ qp->user == arg->user;
}
/* Memory Tracking Functions. */
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
+ struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+ frags);
+ struct net *net = container_of(ipv4, struct net, ipv4);
+
struct ip4_create_arg *arg = a;
qp->protocol = arg->iph->protocol;
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
qp->daddr = arg->iph->daddr;
qp->user = arg->user;
qp->peer = sysctl_ipfrag_max_dist ?
- inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
}
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -545,6 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
+ int sum_truesize;
u8 ecn;
ipq_kill(qp);
@@ -569,7 +574,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
skb_morph(head, qp->q.fragments);
head->next = qp->q.fragments->next;
- kfree_skb(qp->q.fragments);
+ consume_skb(qp->q.fragments);
qp->q.fragments = head;
}
@@ -611,19 +616,32 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
atomic_add(clone->truesize, &qp->q.net->mem);
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
}
- atomic_sub(head->truesize, &qp->q.net->mem);
+ atomic_sub(sum_truesize, &qp->q.net->mem);
head->next = NULL;
head->dev = dev;
@@ -644,8 +662,7 @@ out_nomem:
err = -ENOMEM;
goto out_fail;
out_oversize:
- if (net_ratelimit())
- pr_info("Oversized IP packet from %pI4\n", &qp->saddr);
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
return err;
@@ -782,7 +799,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[2].data = &net->ipv4.frags.timeout;
}
- hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+ hdr = register_net_sysctl(net, "net/ipv4", table);
if (hdr == NULL)
goto err_reg;
@@ -807,7 +824,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
static void ip4_frags_ctl_register(void)
{
- register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
static inline int ip4_frags_ns_ctl_register(struct net *net)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b57532d4742c..42c44b1403c9 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -169,37 +169,56 @@ struct ipgre_net {
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long tx_packets;
- unsigned long tx_bytes;
-} __attribute__((aligned(4*sizeof(unsigned long))));
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+};
-static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
- struct pcpu_tstats sum = { 0 };
int i;
for_each_possible_cpu(i) {
const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-
- sum.rx_packets += tstats->rx_packets;
- sum.rx_bytes += tstats->rx_bytes;
- sum.tx_packets += tstats->tx_packets;
- sum.tx_bytes += tstats->tx_bytes;
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
}
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
+
+ tot->multicast = dev->stats.multicast;
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ return tot;
}
/* Given src, dst and key, find appropriate for input tunnel. */
-static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
- __be32 remote, __be32 local,
- __be32 key, __be16 gre_proto)
+static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
+ __be32 remote, __be32 local,
+ __be32 key, __be16 gre_proto)
{
struct net *net = dev_net(dev);
int link = dev->ifindex;
@@ -464,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
*/
const struct iphdr *iph = (const struct iphdr *)skb->data;
- __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
+ __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
int grehlen = (iph->ihl<<2) + 4;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
@@ -497,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
case ICMP_PORT_UNREACH:
/* Impossible event. */
return;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -512,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
if (code != ICMP_EXC_TTL)
return;
break;
+
+ case ICMP_REDIRECT:
+ break;
}
rcu_read_lock();
@@ -519,7 +538,20 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
flags & GRE_KEY ?
*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
p[1]);
- if (t == NULL || t->parms.iph.daddr == 0 ||
+ if (t == NULL)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_GRE, 0);
+ goto out;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+ IPPROTO_GRE, 0);
+ goto out;
+ }
+ if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
goto out;
@@ -574,7 +606,7 @@ static int ipgre_rcv(struct sk_buff *skb)
iph = ip_hdr(skb);
h = skb->data;
- flags = *(__be16*)h;
+ flags = *(__be16 *)h;
if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
/* - Version must be 0.
@@ -598,11 +630,11 @@ static int ipgre_rcv(struct sk_buff *skb)
offset += 4;
}
if (flags&GRE_KEY) {
- key = *(__be32*)(h + offset);
+ key = *(__be32 *)(h + offset);
offset += 4;
}
if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32*)(h + offset));
+ seqno = ntohl(*(__be32 *)(h + offset));
offset += 4;
}
}
@@ -672,8 +704,10 @@ static int ipgre_rcv(struct sk_buff *skb)
}
tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
tstats->rx_packets++;
tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
__skb_tunnel_rx(skb, tunnel->dev);
@@ -799,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
df |= (old_iph->frag_off&htons(IP_DF));
@@ -900,7 +934,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
htons(ETH_P_TEB) : skb->protocol;
if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
+ __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
if (tunnel->parms.o_flags&GRE_SEQ) {
++tunnel->o_seqno;
@@ -913,7 +947,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
if (tunnel->parms.o_flags&GRE_CSUM) {
*ptr = 0;
- *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+ *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
}
}
@@ -1169,7 +1203,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
{
struct ip_tunnel *t = netdev_priv(dev);
struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
- __be16 *p = (__be16*)(iph+1);
+ __be16 *p = (__be16 *)(iph+1);
memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
p[0] = t->parms.o_flags;
@@ -1253,7 +1287,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats = ipgre_get_stats,
+ .ndo_get_stats64 = ipgre_get_stats64,
};
static void ipgre_dev_free(struct net_device *dev)
@@ -1507,7 +1541,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats = ipgre_get_stats,
+ .ndo_get_stats64 = ipgre_get_stats64,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -1654,17 +1688,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm *p = &t->parms;
- NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
- NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
- NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
- NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
- NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
- NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
- NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
- NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
- NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
- NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
-
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
return 0;
nla_put_failure:
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 26eccc5bab1c..b27d4440f523 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,21 +198,19 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
- int hash, raw;
const struct net_protocol *ipprot;
+ int raw;
resubmit:
raw = raw_local_deliver(skb, protocol);
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
+ ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
int ret;
if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
- if (net_ratelimit())
- printk("%s: proto %d isn't netns-ready\n",
- __func__, protocol);
+ net_info_ratelimited("%s: proto %d isn't netns-ready\n",
+ __func__, protocol);
kfree_skb(skb);
goto out;
}
@@ -298,10 +296,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
if (in_dev) {
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) &&
- net_ratelimit())
- pr_info("source route option %pI4 -> %pI4\n",
- &iph->saddr, &iph->daddr);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("source route option %pI4 -> %pI4\n",
+ &iph->saddr,
+ &iph->daddr);
goto drop;
}
}
@@ -315,26 +313,33 @@ drop:
return true;
}
+int sysctl_ip_early_demux __read_mostly = 1;
+
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ if (sysctl_ip_early_demux && !skb_dst(skb)) {
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->early_demux)
+ ipprot->early_demux(skb);
+ rcu_read_unlock();
+ }
+
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (skb_dst(skb) == NULL) {
+ if (!skb_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
- if (err == -EHOSTUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INADDRERRORS);
- else if (err == -ENETUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INNOROUTES);
- else if (err == -EXDEV)
+ if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index a0d0d9d9b870..1dc01f9793d5 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -27,6 +27,7 @@
#include <net/icmp.h>
#include <net/route.h>
#include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
/*
* Write options to IP header, record destination address to
@@ -92,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
unsigned char *sptr, *dptr;
int soffset, doffset;
int optlen;
- __be32 daddr;
memset(dopt, 0, sizeof(struct ip_options));
@@ -104,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
sptr = skb_network_header(skb);
dptr = dopt->__data;
- daddr = skb_rtable(skb)->rt_spec_dst;
-
if (sopt->rr) {
optlen = sptr[sopt->rr+1];
soffset = sptr[sopt->rr+2];
@@ -179,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
doffset -= 4;
}
if (doffset > 3) {
+ __be32 daddr = fib_compute_spec_dst(skb);
+
memcpy(&start[doffset-1], &daddr, 4);
dopt->faddr = faddr;
dptr[0] = start[0];
@@ -210,10 +210,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
* Simple and stupid 8), but the most efficient way.
*/
-void ip_options_fragment(struct sk_buff * skb)
+void ip_options_fragment(struct sk_buff *skb)
{
unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
int l = opt->optlen;
int optlen;
@@ -241,6 +241,15 @@ void ip_options_fragment(struct sk_buff * skb)
opt->ts_needtime = 0;
}
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+ if (*spec_dst == htonl(INADDR_ANY))
+ *spec_dst = fib_compute_spec_dst(skb);
+}
+
/*
* Verify options and fill pointers in struct options.
* Caller should clear *opt, and set opt->data.
@@ -248,14 +257,14 @@ void ip_options_fragment(struct sk_buff * skb)
*/
int ip_options_compile(struct net *net,
- struct ip_options * opt, struct sk_buff * skb)
+ struct ip_options *opt, struct sk_buff *skb)
{
- int l;
- unsigned char * iph;
- unsigned char * optptr;
- int optlen;
- unsigned char * pp_ptr = NULL;
+ __be32 spec_dst = htonl(INADDR_ANY);
+ unsigned char *pp_ptr = NULL;
struct rtable *rt = NULL;
+ unsigned char *optptr;
+ unsigned char *iph;
+ int optlen, l;
if (skb != NULL) {
rt = skb_rtable(skb);
@@ -331,7 +340,8 @@ int ip_options_compile(struct net *net,
goto error;
}
if (rt) {
- memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
opt->is_changed = 1;
}
optptr[2] += 4;
@@ -373,7 +383,8 @@ int ip_options_compile(struct net *net,
}
opt->ts = optptr - iph;
if (rt) {
- memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
timeptr = &optptr[optptr[2]+3];
}
opt->ts_needaddr = 1;
@@ -413,7 +424,7 @@ int ip_options_compile(struct net *net,
opt->is_changed = 1;
}
} else {
- unsigned overflow = optptr[3]>>4;
+ unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
goto error;
@@ -473,20 +484,20 @@ EXPORT_SYMBOL(ip_options_compile);
* Undo all the changes done by ip_options_compile().
*/
-void ip_options_undo(struct ip_options * opt)
+void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
memmove(optptr+7, optptr+3, optptr[1]-7);
memcpy(optptr+3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
}
if (opt->ts) {
- unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
if (opt->ts_needtime) {
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
@@ -549,8 +560,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp,
void ip_forward_options(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
- unsigned char * optptr;
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ unsigned char *optptr;
struct rtable *rt = skb_rtable(skb);
unsigned char *raw = skb_network_header(skb);
@@ -578,8 +589,10 @@ void ip_forward_options(struct sk_buff *skb)
ip_hdr(skb)->daddr = opt->nexthop;
ip_rt_get_source(&optptr[srrptr-1], skb, rt);
optptr[2] = srrptr+4;
- } else if (net_ratelimit())
- pr_crit("%s(): Argh! Destination lost!\n", __func__);
+ } else {
+ net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+ __func__);
+ }
if (opt->ts_needaddr) {
optptr = raw + opt->ts;
ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4910176d24ed..c528f841ca4b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(ip_local_out);
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
- skb_reset_mac_header(newskb);
- __skb_pull(newskb, skb_network_offset(newskb));
- newskb->pkt_type = PACKET_LOOPBACK;
- newskb->ip_summed = CHECKSUM_UNNECESSARY;
- WARN_ON(!skb_dst(newskb));
- skb_dst_force(newskb);
- netif_rx_ni(newskb);
- return 0;
-}
-
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
int ttl = inet->uc_ttl;
@@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
+ u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -200,22 +188,25 @@ static inline int ip_finish_output2(struct sk_buff *skb)
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
- kfree_skb(skb);
+ consume_skb(skb);
skb = skb2;
}
- rcu_read_lock();
- neigh = dst_get_neighbour_noref(dst);
+ rcu_read_lock_bh();
+ nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
+ neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (neigh) {
- int res = neigh_output(neigh, skb);
+ int res = dst_neigh_output(dst, neigh, skb);
- rcu_read_unlock();
+ rcu_read_unlock_bh();
return res;
}
- rcu_read_unlock();
+ rcu_read_unlock_bh();
- if (net_ratelimit())
- printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
kfree_skb(skb);
return -EINVAL;
}
@@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
newskb, NULL, newskb->dev,
- ip_dev_loopback_xmit);
+ dev_loopback_xmit);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
- NULL, newskb->dev, ip_dev_loopback_xmit);
+ NULL, newskb->dev, dev_loopback_xmit);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -709,7 +700,7 @@ slow_path:
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
}
- kfree_skb(skb);
+ consume_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return err;
@@ -1472,19 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
/*
* Generic function to send a packet as reply to another packet.
- * Used to send TCP resets so far. ICMP should use this function too.
+ * Used to send some TCP resets/acks so far.
*
- * Should run single threaded per socket because it uses the sock
- * structure to pass arguments.
+ * Use a fake percpu inet socket to avoid false sharing and contention.
*/
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
- const struct ip_reply_arg *arg, unsigned int len)
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+ .sk = {
+ .__sk_common = {
+ .skc_refcnt = ATOMIC_INIT(1),
+ },
+ .sk_wmem_alloc = ATOMIC_INIT(1),
+ .sk_allocation = GFP_ATOMIC,
+ .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
+ },
+ .pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+ __be32 saddr, const struct ip_reply_arg *arg,
+ unsigned int len)
{
- struct inet_sock *inet = inet_sk(sk);
struct ip_options_data replyopts;
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
+ struct sk_buff *nskb;
+ struct sock *sk;
+ struct inet_sock *inet;
if (ip_options_echo(&replyopts.opt.opt, skb))
return;
@@ -1502,38 +1507,39 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
flowi4_init_output(&fl4, arg->bound_dev_if, 0,
RT_TOS(arg->tos),
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
- daddr, rt->rt_spec_dst,
+ daddr, saddr,
tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
- rt = ip_route_output_key(sock_net(sk), &fl4);
+ rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
return;
- /* And let IP do all the hard work.
+ inet = &get_cpu_var(unicast_sock);
- This chunk is not reenterable, hence spinlock.
- Note that it uses the fact, that this function is called
- with locally disabled BH and that sk cannot be already spinlocked.
- */
- bh_lock_sock(sk);
inet->tos = arg->tos;
+ sk = &inet->sk;
sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
+ sock_net_set(sk, net);
+ __skb_queue_head_init(&sk->sk_write_queue);
+ sk->sk_sndbuf = sysctl_wmem_default;
ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
&ipc, &rt, MSG_DONTWAIT);
- if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ nskb = skb_peek(&sk->sk_write_queue);
+ if (nskb) {
if (arg->csumoffset >= 0)
- *((__sum16 *)skb_transport_header(skb) +
- arg->csumoffset) = csum_fold(csum_add(skb->csum,
+ *((__sum16 *)skb_transport_header(nskb) +
+ arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
- skb->ip_summed = CHECKSUM_NONE;
+ nskb->ip_summed = CHECKSUM_NONE;
+ skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
- bh_unlock_sock(sk);
+ put_cpu_var(unicast_sock);
ip_rt_put(rt);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 2fd0fba77124..de29f46f68b0 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -40,6 +40,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/transp_v6.h>
#endif
+#include <net/ip_fib.h>
#include <linux/errqueue.h>
#include <asm/uaccess.h>
@@ -90,7 +91,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options * opt = (struct ip_options *)optbuf;
+ struct ip_options *opt = (struct ip_options *)optbuf;
if (IPCB(skb)->opt.optlen == 0)
return;
@@ -147,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
struct inet_sock *inet = inet_sk(skb->sk);
- unsigned flags = inet->cmsg_flags;
+ unsigned int flags = inet->cmsg_flags;
/* Ordered by supposed usage frequency */
if (flags & 1)
@@ -673,10 +674,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
} else {
memset(&mreq, 0, sizeof(mreq));
- if (optlen >= sizeof(struct in_addr) &&
- copy_from_user(&mreq.imr_address, optval,
- sizeof(struct in_addr)))
- break;
+ if (optlen >= sizeof(struct ip_mreq)) {
+ if (copy_from_user(&mreq, optval,
+ sizeof(struct ip_mreq)))
+ break;
+ } else if (optlen >= sizeof(struct in_addr)) {
+ if (copy_from_user(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
+ break;
+ }
}
if (!mreq.imr_ifindex) {
@@ -1014,8 +1020,8 @@ e_inval:
* @sk: socket
* @skb: buffer
*
- * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst
- * in skb->cb[] before dst drop.
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
* This way, receiver doesnt make cache line misses to read rtable.
*/
void ipv4_pktinfo_prepare(struct sk_buff *skb)
@@ -1025,7 +1031,7 @@ void ipv4_pktinfo_prepare(struct sk_buff *skb)
if (rt) {
pktinfo->ipi_ifindex = rt->rt_iif;
- pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst;
+ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
@@ -1094,7 +1100,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
*/
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen, unsigned flags)
+ char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
int val;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000000..c41b5c359936
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,956 @@
+/*
+ * Linux NET3: IP/IP protocol decoder modified to support
+ * virtual tunnel interface
+ *
+ * Authors:
+ * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+ This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define HASH_SIZE 16
+#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+
+static int vti_net_id __read_mostly;
+struct vti_net {
+ struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel **tunnels[4];
+
+ struct net_device *fb_tunnel_dev;
+};
+
+static int vti_fb_tunnel_init(struct net_device *dev);
+static int vti_tunnel_init(struct net_device *dev);
+static void vti_tunnel_setup(struct net_device *dev);
+static void vti_dev_free(struct net_device *dev);
+static int vti_tunnel_bind_dev(struct net_device *dev);
+
+/* Locking : hash tables are protected by RCU and RTNL */
+
+#define for_each_ip_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+};
+
+#define VTI_XMIT(stats1, stats2) do { \
+ int err; \
+ int pkt_len = skb->len; \
+ err = dst_output(skb); \
+ if (net_xmit_eval(err) == 0) { \
+ u64_stats_update_begin(&(stats1)->syncp); \
+ (stats1)->tx_bytes += pkt_len; \
+ (stats1)->tx_packets++; \
+ u64_stats_update_end(&(stats1)->syncp); \
+ } else { \
+ (stats2)->tx_errors++; \
+ (stats2)->tx_aborted_errors++; \
+ } \
+} while (0)
+
+
+static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ tot->multicast = dev->stats.multicast;
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ return tot;
+}
+
+static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
+ __be32 remote, __be32 local)
+{
+ unsigned h0 = HASH(remote);
+ unsigned h1 = HASH(local);
+ struct ip_tunnel *t;
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+
+ for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ return t;
+ for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
+ if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ return t;
+
+ for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
+ if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+ return t;
+
+ for_each_ip_tunnel_rcu(ipn->tunnels_wc[0])
+ if (t && (t->dev->flags&IFF_UP))
+ return t;
+ return NULL;
+}
+
+static struct ip_tunnel **__vti_bucket(struct vti_net *ipn,
+ struct ip_tunnel_parm *parms)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ unsigned h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ return &ipn->tunnels[prio][h];
+}
+
+static inline struct ip_tunnel **vti_bucket(struct vti_net *ipn,
+ struct ip_tunnel *t)
+{
+ return __vti_bucket(ipn, &t->parms);
+}
+
+static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
+{
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = vti_bucket(ipn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
+{
+ struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
+
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+static struct ip_tunnel *vti_tunnel_locate(struct net *net,
+ struct ip_tunnel_parm *parms,
+ int create)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+
+ for (tp = __vti_bucket(ipn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
+ return t;
+ }
+ if (!create)
+ return NULL;
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else
+ strcpy(name, "vti%d");
+
+ dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
+ if (dev == NULL)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ nt = netdev_priv(dev);
+ nt->parms = *parms;
+ dev->rtnl_link_ops = &vti_link_ops;
+
+ vti_tunnel_bind_dev(dev);
+
+ if (register_netdevice(dev) < 0)
+ goto failed_free;
+
+ dev_hold(dev);
+ vti_tunnel_link(ipn, nt);
+ return nt;
+
+failed_free:
+ free_netdev(dev);
+ return NULL;
+}
+
+static void vti_tunnel_uninit(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+
+ vti_tunnel_unlink(ipn, netdev_priv(dev));
+ dev_put(dev);
+}
+
+static int vti_err(struct sk_buff *skb, u32 info)
+{
+
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ */
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
+ int err;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return 0;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return 0;
+ default:
+ /* All others are translated to HOST_UNREACH. */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return 0;
+ break;
+ }
+
+ err = -ENOENT;
+
+ rcu_read_lock();
+ t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+ if (t == NULL)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ err = 0;
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+/* We dont digest the packet therefore let the packet pass */
+static int vti_rcv(struct sk_buff *skb)
+{
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph = ip_hdr(skb);
+
+ rcu_read_lock();
+ tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+ if (tunnel != NULL) {
+ struct pcpu_tstats *tstats;
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ skb->dev = tunnel->dev;
+ rcu_read_unlock();
+ return 1;
+ }
+ rcu_read_unlock();
+
+ return -1;
+}
+
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct pcpu_tstats *tstats;
+ struct iphdr *tiph = &tunnel->parms.iph;
+ u8 tos;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *old_iph = ip_hdr(skb);
+ __be32 dst = tiph->daddr;
+ struct flowi4 fl4;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto tx_error;
+
+ tos = old_iph->tos;
+
+ memset(&fl4, 0, sizeof(fl4));
+ flowi4_init_output(&fl4, tunnel->parms.link,
+ htonl(tunnel->parms.i_key), RT_TOS(tos),
+ RT_SCOPE_UNIVERSE,
+ IPPROTO_IPIP, 0,
+ dst, tiph->saddr, 0, 0);
+ rt = ip_route_output_key(dev_net(dev), &fl4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!rt->dst.xfrm ||
+ rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ tdev = rt->dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+ IPSKB_REROUTED);
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ nf_reset(skb);
+ skb->dev = skb_dst(skb)->dev;
+
+ tstats = this_cpu_ptr(dev->tstats);
+ VTI_XMIT(tstats, &dev->stats);
+ return NETDEV_TX_OK;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int vti_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+
+ tunnel = netdev_priv(dev);
+ iph = &tunnel->parms.iph;
+
+ if (iph->daddr) {
+ struct rtable *rt;
+ struct flowi4 fl4;
+ memset(&fl4, 0, sizeof(fl4));
+ flowi4_init_output(&fl4, tunnel->parms.link,
+ htonl(tunnel->parms.i_key),
+ RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
+ IPPROTO_IPIP, 0,
+ iph->daddr, iph->saddr, 0, 0);
+ rt = ip_route_output_key(dev_net(dev), &fl4);
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+ if (tdev) {
+ dev->hard_header_len = tdev->hard_header_len +
+ sizeof(struct iphdr);
+ dev->mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+ return dev->mtu;
+}
+
+static int
+vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel *t;
+ struct net *net = dev_net(dev);
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == ipn->fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
+ sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = vti_tunnel_locate(net, &p, 0);
+ }
+ if (t == NULL)
+ t = netdev_priv(dev);
+ memcpy(&p, &t->parms, sizeof(p));
+ p.i_flags |= GRE_KEY | VTI_ISVTI;
+ p.o_flags |= GRE_KEY;
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5)
+ goto done;
+
+ t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
+
+ if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ if (((dev->flags&IFF_POINTOPOINT) &&
+ !p.iph.daddr) ||
+ (!(dev->flags&IFF_POINTOPOINT) &&
+ p.iph.daddr)) {
+ err = -EINVAL;
+ break;
+ }
+ t = netdev_priv(dev);
+ vti_tunnel_unlink(ipn, t);
+ synchronize_net();
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ t->parms.iph.protocol = IPPROTO_IPIP;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ vti_tunnel_link(ipn, t);
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ if (cmd == SIOCCHGTUNNEL) {
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ if (t->parms.link != p.link) {
+ t->parms.link = p.link;
+ vti_tunnel_bind_dev(dev);
+ netdev_state_change(dev);
+ }
+ }
+ p.i_flags |= GRE_KEY | VTI_ISVTI;
+ p.o_flags |= GRE_KEY;
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
+ sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == ipn->fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
+ sizeof(p)))
+ goto done;
+ err = -ENOENT;
+
+ t = vti_tunnel_locate(net, &p, 0);
+ if (t == NULL)
+ goto done;
+ err = -EPERM;
+ if (t->dev == ipn->fb_tunnel_dev)
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < 68 || new_mtu > 0xFFF8)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = {
+ .ndo_init = vti_tunnel_init,
+ .ndo_uninit = vti_tunnel_uninit,
+ .ndo_start_xmit = vti_tunnel_xmit,
+ .ndo_do_ioctl = vti_tunnel_ioctl,
+ .ndo_change_mtu = vti_tunnel_change_mtu,
+ .ndo_get_stats64 = vti_get_stats64,
+};
+
+static void vti_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+static void vti_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &vti_netdev_ops;
+ dev->destructor = vti_dev_free;
+
+ dev->type = ARPHRD_TUNNEL;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+ dev->mtu = ETH_DATA_LEN;
+ dev->flags = IFF_NOARP;
+ dev->iflink = 0;
+ dev->addr_len = 4;
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_LLTX;
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+}
+
+static int vti_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ dev_hold(dev);
+ rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+ return 0;
+}
+
+static struct xfrm_tunnel vti_handler __read_mostly = {
+ .handler = vti_rcv,
+ .err_handler = vti_err,
+ .priority = 1,
+};
+
+static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
+{
+ int prio;
+
+ for (prio = 1; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+
+ t = rtnl_dereference(ipn->tunnels[prio][h]);
+ while (t != NULL) {
+ unregister_netdevice_queue(t->dev, head);
+ t = rtnl_dereference(t->next);
+ }
+ }
+ }
+}
+
+static int __net_init vti_init_net(struct net *net)
+{
+ int err;
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+
+ ipn->tunnels[0] = ipn->tunnels_wc;
+ ipn->tunnels[1] = ipn->tunnels_l;
+ ipn->tunnels[2] = ipn->tunnels_r;
+ ipn->tunnels[3] = ipn->tunnels_r_l;
+
+ ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
+ "ip_vti0",
+ vti_tunnel_setup);
+ if (!ipn->fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err_alloc_dev;
+ }
+ dev_net_set(ipn->fb_tunnel_dev, net);
+
+ err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
+ ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
+
+ err = register_netdev(ipn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
+ return 0;
+
+err_reg_dev:
+ vti_dev_free(ipn->fb_tunnel_dev);
+err_alloc_dev:
+ /* nothing */
+ return err;
+}
+
+static void __net_exit vti_exit_net(struct net *net)
+{
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ vti_destroy_tunnels(ipn, &list);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations vti_net_ops = {
+ .init = vti_init_net,
+ .exit = vti_exit_net,
+ .id = &vti_net_id,
+ .size = sizeof(struct vti_net),
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ return 0;
+}
+
+static void vti_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_IPIP;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_VTI_LINK])
+ parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+ if (data[IFLA_VTI_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+ if (data[IFLA_VTI_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+ if (data[IFLA_VTI_LOCAL])
+ parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
+
+ if (data[IFLA_VTI_REMOTE])
+ parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
+
+}
+
+static int vti_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ vti_netlink_parms(data, &nt->parms);
+
+ if (vti_tunnel_locate(net, &nt->parms, 0))
+ return -EEXIST;
+
+ mtu = vti_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ dev_hold(dev);
+ vti_tunnel_link(ipn, nt);
+
+out:
+ return err;
+}
+
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel *t, *nt;
+ struct net *net = dev_net(dev);
+ struct vti_net *ipn = net_generic(net, vti_net_id);
+ struct ip_tunnel_parm p;
+ int mtu;
+
+ if (dev == ipn->fb_tunnel_dev)
+ return -EINVAL;
+
+ nt = netdev_priv(dev);
+ vti_netlink_parms(data, &p);
+
+ t = vti_tunnel_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = nt;
+
+ vti_tunnel_unlink(ipn, t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ }
+ vti_tunnel_link(ipn, t);
+ netdev_state_change(dev);
+ }
+
+ if (t->parms.link != p.link) {
+ t->parms.link = p.link;
+ mtu = vti_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+ netdev_state_change(dev);
+ }
+
+ return 0;
+}
+
+static size_t vti_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_VTI_LINK */
+ nla_total_size(4) +
+ /* IFLA_VTI_IKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_OKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_VTI_REMOTE */
+ nla_total_size(4) +
+ 0;
+}
+
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm *p = &t->parms;
+
+ nla_put_u32(skb, IFLA_VTI_LINK, p->link);
+ nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
+ nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
+ nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
+ nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
+
+ return 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
+ [IFLA_VTI_LINK] = { .type = NLA_U32 },
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = {
+ .kind = "vti",
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = vti_tunnel_setup,
+ .validate = vti_tunnel_validate,
+ .newlink = vti_newlink,
+ .changelink = vti_changelink,
+ .get_size = vti_get_size,
+ .fill_info = vti_fill_info,
+};
+
+static int __init vti_init(void)
+{
+ int err;
+
+ pr_info("IPv4 over IPSec tunneling driver\n");
+
+ err = register_pernet_device(&vti_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_mode_tunnel_input_register(&vti_handler);
+ if (err < 0) {
+ unregister_pernet_device(&vti_net_ops);
+ pr_info(KERN_INFO "vti init: can't register tunnel\n");
+ }
+
+ err = rtnl_link_register(&vti_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return err;
+
+rtnl_link_failed:
+ xfrm4_mode_tunnel_input_deregister(&vti_handler);
+ unregister_pernet_device(&vti_net_ops);
+ return err;
+}
+
+static void __exit vti_fini(void)
+{
+ rtnl_link_unregister(&vti_link_ops);
+ if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
+ pr_info("vti close: can't deregister tunnel\n");
+
+ unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 63b64c45a826..d3ab47e19a89 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -31,17 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return;
+ case ICMP_REDIRECT:
+ break;
+ default:
return;
+ }
spi = htonl(ntohs(ipch->cpi));
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
spi, IPPROTO_COMP, AF_INET);
if (!x)
return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
- spi, &iph->daddr);
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 92ac7e7363a0..67e8a6b086ea 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -808,8 +808,6 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
b->op = BOOTP_REQUEST;
if (dev->type < 256) /* check for false types */
b->htype = dev->type;
- else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
- b->htype = ARPHRD_IEEE802;
else if (dev->type == ARPHRD_FDDI)
b->htype = ARPHRD_ETHER;
else {
@@ -955,8 +953,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Fragments are not supported */
if (ip_is_fragment(h)) {
- if (net_ratelimit())
- pr_err("DHCP/BOOTP: Ignoring fragmented reply\n");
+ net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
goto drop;
}
@@ -1004,16 +1001,14 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Is it a reply to our BOOTP request? */
if (b->op != BOOTP_REPLY ||
b->xid != d->xid) {
- if (net_ratelimit())
- pr_err("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
- b->op, b->xid);
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
+ b->op, b->xid);
goto drop_unlock;
}
/* Is it a reply for the device we are configuring? */
if (b->xid != ic_dev_xid) {
- if (net_ratelimit())
- pr_err("DHCP/BOOTP: Ignoring delayed packet\n");
+ net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
goto drop_unlock;
}
@@ -1198,7 +1193,7 @@ static int __init ic_dynamic(void)
d = ic_first_dev;
retries = CONF_SEND_RETRIES;
get_random_bytes(&timeout, sizeof(timeout));
- timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
for (;;) {
/* Track the device we are configuring */
ic_dev_xid = d->xid;
@@ -1626,11 +1621,13 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+__setup("ip=", ip_auto_config_setup);
static int __init nfsaddrs_config_setup(char *addrs)
{
return ip_auto_config_setup(addrs);
}
+__setup("nfsaddrs=", nfsaddrs_config_setup);
static int __init vendor_class_identifier_setup(char *addrs)
{
@@ -1641,7 +1638,4 @@ static int __init vendor_class_identifier_setup(char *addrs)
vendor_class_identifier);
return 1;
}
-
-__setup("ip=", ip_auto_config_setup);
-__setup("nfsaddrs=", nfsaddrs_config_setup);
__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ae1413e3f2f8..2c2c35bace76 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -144,33 +144,48 @@ static void ipip_dev_free(struct net_device *dev);
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long tx_packets;
- unsigned long tx_bytes;
-} __attribute__((aligned(4*sizeof(unsigned long))));
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+};
-static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
- struct pcpu_tstats sum = { 0 };
int i;
for_each_possible_cpu(i) {
const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-
- sum.rx_packets += tstats->rx_packets;
- sum.rx_bytes += tstats->rx_bytes;
- sum.tx_packets += tstats->tx_packets;
- sum.tx_bytes += tstats->tx_bytes;
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
}
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
}
-static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
+static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
unsigned int h0 = HASH(remote);
@@ -245,7 +260,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
rcu_assign_pointer(*tp, t);
}
-static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
__be32 remote = parms->iph.daddr;
@@ -333,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
case ICMP_PORT_UNREACH:
/* Impossible event. */
return 0;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -348,13 +360,32 @@ static int ipip_err(struct sk_buff *skb, u32 info)
if (code != ICMP_EXC_TTL)
return 0;
break;
+ case ICMP_REDIRECT:
+ break;
}
err = -ENOENT;
rcu_read_lock();
t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
- if (t == NULL || t->parms.iph.daddr == 0)
+ if (t == NULL)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->dev->ifindex, 0, IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
+ IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (t->parms.iph.daddr == 0)
goto out;
err = 0;
@@ -404,8 +435,10 @@ static int ipip_rcv(struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
tstats->rx_packets++;
tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
__skb_tunnel_rx(skb, tunnel->dev);
@@ -486,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
if ((old_iph->frag_off & htons(IP_DF)) &&
mtu < ntohs(old_iph->tot_len)) {
@@ -730,7 +763,7 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ipip_tunnel_change_mtu,
- .ndo_get_stats = ipip_get_stats,
+ .ndo_get_stats64 = ipip_get_stats64,
};
static void ipip_dev_free(struct net_device *dev)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 960fbfc3e976..5716c6b808d6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -524,8 +524,8 @@ failure:
}
#endif
-/*
- * Delete a VIF entry
+/**
+ * vif_delete - Delete a VIF entry
* @notify: Set to 1, if the caller is a notifier_call
*/
@@ -949,8 +949,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
if (ret < 0) {
- if (net_ratelimit())
- pr_warn("mroute: pending queue full, dropping entries\n");
+ net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
}
@@ -1575,6 +1574,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -2006,37 +2006,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
{
int ct;
struct rtnexthop *nhp;
- u8 *b = skb_tail_pointer(skb);
- struct rtattr *mp_head;
+ struct nlattr *mp_attr;
/* If cache is unresolved, don't try to parse IIF and OIF */
if (c->mfc_parent >= MAXVIFS)
return -ENOENT;
- if (VIF_EXISTS(mrt, c->mfc_parent))
- RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
+ if (VIF_EXISTS(mrt, c->mfc_parent) &&
+ nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ return -EMSGSIZE;
- mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
+ if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
+ return -EMSGSIZE;
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
- goto rtattr_failure;
- nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
+ nla_nest_cancel(skb, mp_attr);
+ return -EMSGSIZE;
+ }
+
nhp->rtnh_flags = 0;
nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
}
- mp_head->rta_type = RTA_MULTIPATH;
- mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
+
+ nla_nest_end(skb, mp_attr);
+
rtm->rtm_type = RTN_MULTICAST;
return 1;
-
-rtattr_failure:
- nlmsg_trim(skb, b);
- return -EMSGSIZE;
}
int ipmr_get_route(struct net *net, struct sk_buff *skb,
@@ -2119,15 +2119,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
rtm->rtm_src_len = 32;
rtm->rtm_tos = 0;
rtm->rtm_table = mrt->id;
- NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
rtm->rtm_type = RTN_MULTICAST;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = RTPROT_UNSPEC;
rtm->rtm_flags = 0;
- NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
- NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
-
+ if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp))
+ goto nla_put_failure;
if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
goto nla_put_failure;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4f47e064e262..ed1b36783192 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -12,7 +12,7 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
+int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
{
struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
@@ -237,13 +237,3 @@ static void ipv4_netfilter_fini(void)
module_init(ipv4_netfilter_init);
module_exit(ipv4_netfilter_fini);
-
-#ifdef CONFIG_SYSCTL
-struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "netfilter", },
- { }
-};
-EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
-#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 240b68469a7a..c20674dc9452 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -66,6 +66,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
-
-obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
-
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fd7a3f68917f..97e61eadf580 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -221,9 +221,8 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
static unsigned int
arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- pr_err("arp_tables: error: '%s'\n",
- (const char *)par->targinfo);
+ net_err_ratelimited("arp_tables: error: '%s'\n",
+ (const char *)par->targinfo);
return NF_DROP;
}
@@ -303,7 +302,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
- verdict = (unsigned)(-v) - 1;
+ verdict = (unsigned int)(-v) - 1;
break;
}
e = back;
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
deleted file mode 100644
index 94d45e1f8882..000000000000
--- a/net/ipv4/netfilter/ip_queue.c
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * This is a module which is used for queueing IPv4 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/security.h>
-#include <linux/net.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <net/netfilter/nf_queue.h>
-#include <net/ip.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip_queue"
-#define NET_IPQ_QMAX 2088
-#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-
-typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_SPINLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static inline void
-__ipq_enqueue_entry(struct nf_queue_entry *entry)
-{
- list_add_tail(&entry->list, &queue_list);
- queue_total++;
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if (range > 0xFFFF)
- range = 0xFFFF;
- copy_range = range;
- copy_mode = mode;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NULL, 0);
-}
-
-static struct nf_queue_entry *
-ipq_find_dequeue_entry(unsigned long id)
-{
- struct nf_queue_entry *entry = NULL, *i;
-
- spin_lock_bh(&queue_lock);
-
- list_for_each_entry(i, &queue_list, list) {
- if ((unsigned long)i == id) {
- entry = i;
- break;
- }
- }
-
- if (entry) {
- list_del(&entry->list);
- queue_total--;
- }
-
- spin_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct nf_queue_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &queue_list, list) {
- if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue_total--;
- nf_reinject(entry, NF_DROP);
- }
- }
-}
-
-static void
-ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- spin_lock_bh(&queue_lock);
- __ipq_flush(cmpfn, data);
- spin_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
-{
- sk_buff_data_t old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
- struct timeval tv;
-
- switch (ACCESS_ONCE(copy_mode)) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- break;
-
- case IPQ_COPY_PACKET:
- if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
- (*errp = skb_checksum_help(entry->skb)))
- return NULL;
-
- data_len = ACCESS_ONCE(copy_range);
- if (data_len == 0 || data_len > entry->skb->len)
- data_len = entry->skb->len;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- return NULL;
- }
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- tv = ktime_to_timeval(entry->skb->tstamp);
- pmsg->timestamp_sec = tv.tv_sec;
- pmsg->timestamp_usec = tv.tv_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->indev)
- strcpy(pmsg->indev_name, entry->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->outdev)
- strcpy(pmsg->outdev_name, entry->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->indev && entry->skb->dev &&
- entry->skb->mac_header != entry->skb->network_header) {
- pmsg->hw_type = entry->skb->dev->type;
- pmsg->hw_addrlen = dev_parse_header(entry->skb,
- pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- kfree_skb(skb);
- *errp = -EINVAL;
- printk(KERN_ERR "ip_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- return status;
-
- spin_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip_queue: full at %d entries, "
- "dropping packets(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- spin_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
-{
- int diff;
- struct iphdr *user_iph = (struct iphdr *)v->payload;
- struct sk_buff *nskb;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
- diff, GFP_ATOMIC);
- if (!nskb) {
- printk(KERN_WARNING "ip_queue: error "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- kfree_skb(e->skb);
- e->skb = nskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(e->skb, v->data_len))
- return -ENOMEM;
- skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct nf_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv4(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- nf_reinject(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- spin_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->indev)
- if (entry->indev->ifindex == ifindex)
- return 1;
- if (entry->outdev)
- if (entry->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- ipq_flush(dev_cmp, ifindex);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-__ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags;
- unsigned int nlmsglen, skblen;
- struct nlmsghdr *nlh;
- bool enable_timestamp = false;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = nlmsg_hdr(skb);
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (!capable(CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- spin_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- spin_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- enable_timestamp = true;
- peer_pid = pid;
- }
-
- spin_unlock_bh(&queue_lock);
- if (enable_timestamp)
- net_enable_timestamp();
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
-}
-
-static void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- mutex_lock(&ipqnl_mutex);
- __ipq_rcv_skb(skb);
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
- spin_lock_bh(&queue_lock);
- if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
- __ipq_reset();
- spin_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { }
-};
-#endif
-
-#ifdef CONFIG_PROC_FS
-static int ip_queue_show(struct seq_file *m, void *v)
-{
- spin_lock_bh(&queue_lock);
-
- seq_printf(m,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netlink dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- spin_unlock_bh(&queue_lock);
- return 0;
-}
-
-static int ip_queue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ip_queue_show, NULL);
-}
-
-static const struct file_operations ip_queue_proc_fops = {
- .open = ip_queue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
-};
-#endif
-
-static const struct nf_queue_handler nfqh = {
- .name = "ip_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc __maybe_unused;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
- ipq_rcv_skb, NULL, THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
- &ip_queue_proc_fops);
- if (!proc) {
- printk(KERN_ERR "ip_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-#endif
- register_netdevice_notifier(&ipq_dev_notifier);
-#ifdef CONFIG_SYSCTL
- ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table);
-#endif
- status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-cleanup_ipqnl: __maybe_unused
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
-
- ipq_flush(NULL, 0);
-
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv4 packet queue handler");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
-
-module_init(ip_queue_init);
-module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 24e556e83a3b..170b1fdd6b72 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -153,8 +153,7 @@ ip_checkentry(const struct ipt_ip *ip)
static unsigned int
ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- pr_info("error: `%s'\n", (const char *)par->targinfo);
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
return NF_DROP;
}
@@ -377,7 +376,7 @@ ipt_do_table(struct sk_buff *skb,
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
- verdict = (unsigned)(-v) - 1;
+ verdict = (unsigned int)(-v) - 1;
break;
}
if (*stackptr <= origptr) {
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a639967eb727..fe5daea5214d 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -246,8 +246,7 @@ clusterip_hashfn(const struct sk_buff *skb,
dport = ports[1];
}
} else {
- if (net_ratelimit())
- pr_info("unknown protocol %u\n", iph->protocol);
+ net_info_ratelimited("unknown protocol %u\n", iph->protocol);
}
switch (config->hash_mode) {
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ba5756d20165..1109f7f6c254 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum,
pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
- /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
- nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
- sizeof(*pm)+copy_len);
+ nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+ sizeof(*pm)+copy_len, 0);
+ if (!nlh) {
+ pr_debug("error during nlmsg_put\n");
+ goto out_unlock;
+ }
ub->qlen++;
- pm = NLMSG_DATA(nlh);
+ pm = nlmsg_data(nlh);
/* We might not have a timestamp, get one */
if (skb->tstamp.tv64 == 0)
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
nlh->nlmsg_type = NLMSG_DONE;
ulog_send(groupnum);
}
-
+out_unlock:
spin_unlock_bh(&ulog_lock);
return;
-nlmsg_failure:
- pr_debug("error during NLMSG_PUT\n");
alloc_failure:
pr_debug("Error building netlink message\n");
spin_unlock_bh(&ulog_lock);
@@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
static int __init ulog_tg_init(void)
{
int ret, i;
+ struct netlink_kernel_cfg cfg = {
+ .groups = ULOG_MAXNLGROUPS,
+ };
pr_debug("init module\n");
@@ -392,9 +396,8 @@ static int __init ulog_tg_init(void)
for (i = 0; i < ULOG_MAXNLGROUPS; i++)
setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
- nflognl = netlink_kernel_create(&init_net,
- NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
- NULL, THIS_MODULE);
+ nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
+ THIS_MODULE, &cfg);
if (!nflognl)
return -ENOMEM;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index de9da21113a1..e7ff2dcab6ce 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -74,24 +74,32 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
if (iph == NULL)
- return -NF_DROP;
+ return -NF_ACCEPT;
/* Conntrack defragments packets, we might still see fragments
* inside ICMP packets though. */
if (iph->frag_off & htons(IP_OFFSET))
- return -NF_DROP;
+ return -NF_ACCEPT;
*dataoff = nhoff + (iph->ihl << 2);
*protonum = iph->protocol;
+ /* Check bogus IP headers */
+ if (*dataoff > skb->len) {
+ pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
+ "nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -NF_ACCEPT;
+ }
+
return NF_ACCEPT;
}
-static unsigned int ipv4_confirm(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int ipv4_helper(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -102,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
/* This is where we call the helper: as the packet goes out. */
ct = nf_ct_get(skb, &ctinfo);
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
+ return NF_ACCEPT;
help = nfct_help(ct);
if (!help)
- goto out;
+ return NF_ACCEPT;
/* rcu_read_lock()ed by nf_hook_slow */
helper = rcu_dereference(help->helper);
if (!helper)
- goto out;
+ return NF_ACCEPT;
ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
ct, ctinfo);
- if (ret != NF_ACCEPT) {
+ if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
"nf_ct_%s: dropping packet", helper->name);
- return ret;
}
+ return ret;
+}
+
+static unsigned int ipv4_confirm(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
/* adjust seqs for loopback traffic only in outgoing direction */
if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -177,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
.priority = NF_IP_PRI_CONNTRACK,
},
{
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
@@ -184,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
@@ -199,35 +235,30 @@ static int log_invalid_proto_max = 255;
static ctl_table ip_ct_sysctl_table[] = {
{
.procname = "ip_conntrack_max",
- .data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_count",
- .data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_buckets",
- .data = &init_net.ct.htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_checksum",
- .data = &init_net.ct.sysctl_checksum,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_log_invalid",
- .data = &init_net.ct.sysctl_log_invalid,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -303,8 +334,9 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);
- NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
+ if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -342,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = {
.owner = THIS_MODULE,
};
+static int ipv4_init_net(struct net *net)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ struct nf_ip_net *in = &net->ct.nf_ct_proto;
+ in->ctl_table = kmemdup(ip_ct_sysctl_table,
+ sizeof(ip_ct_sysctl_table),
+ GFP_KERNEL);
+ if (!in->ctl_table)
+ return -ENOMEM;
+
+ in->ctl_table[0].data = &nf_conntrack_max;
+ in->ctl_table[1].data = &net->ct.count;
+ in->ctl_table[2].data = &net->ct.htable_size;
+ in->ctl_table[3].data = &net->ct.sysctl_checksum;
+ in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
+#endif
+ return 0;
+}
+
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.l3proto = PF_INET,
.name = "ipv4",
@@ -356,9 +407,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nla_policy = ipv4_nla_policy,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
- .ctl_table = ip_ct_sysctl_table,
+ .ctl_table_path = "net/ipv4/netfilter",
#endif
+ .init_net = ipv4_init_net,
.me = THIS_MODULE,
};
@@ -369,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("ip_conntrack");
MODULE_LICENSE("GPL");
+static int ipv4_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_conntrack_l4proto_register(net,
+ &nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
+ goto out_tcp;
+ }
+ ret = nf_conntrack_l4proto_register(net,
+ &nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
+ goto out_udp;
+ }
+ ret = nf_conntrack_l4proto_register(net,
+ &nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
+ goto out_icmp;
+ }
+ ret = nf_conntrack_l3proto_register(net,
+ &nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
+ goto out_ipv4;
+ }
+ return 0;
+out_ipv4:
+ nf_conntrack_l4proto_unregister(net,
+ &nf_conntrack_l4proto_icmp);
+out_icmp:
+ nf_conntrack_l4proto_unregister(net,
+ &nf_conntrack_l4proto_udp4);
+out_udp:
+ nf_conntrack_l4proto_unregister(net,
+ &nf_conntrack_l4proto_tcp4);
+out_tcp:
+ return ret;
+}
+
+static void ipv4_net_exit(struct net *net)
+{
+ nf_conntrack_l3proto_unregister(net,
+ &nf_conntrack_l3proto_ipv4);
+ nf_conntrack_l4proto_unregister(net,
+ &nf_conntrack_l4proto_icmp);
+ nf_conntrack_l4proto_unregister(net,
+ &nf_conntrack_l4proto_udp4);
+ nf_conntrack_l4proto_unregister(net,
+ &nf_conntrack_l4proto_tcp4);
+}
+
+static struct pernet_operations ipv4_net_ops = {
+ .init = ipv4_net_init,
+ .exit = ipv4_net_exit,
+};
+
static int __init nf_conntrack_l3proto_ipv4_init(void)
{
int ret = 0;
@@ -382,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
return ret;
}
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
+ ret = register_pernet_subsys(&ipv4_net_ops);
if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register tcp.\n");
+ pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
goto cleanup_sockopt;
}
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register udp.\n");
- goto cleanup_tcp;
- }
-
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register icmp.\n");
- goto cleanup_udp;
- }
-
- ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register ipv4\n");
- goto cleanup_icmp;
- }
-
ret = nf_register_hooks(ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
if (ret < 0) {
pr_err("nf_conntrack_ipv4: can't register hooks.\n");
- goto cleanup_ipv4;
+ goto cleanup_pernet;
}
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
ret = nf_conntrack_ipv4_compat_init();
@@ -422,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
cleanup_hooks:
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
#endif
- cleanup_ipv4:
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- cleanup_icmp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- cleanup_udp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- cleanup_tcp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ cleanup_pernet:
+ unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
nf_unregister_sockopt(&so_getorigdst);
return ret;
@@ -442,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
nf_conntrack_ipv4_compat_fini();
#endif
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7cbe9cb261c2..5241d997ab75 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,6 +23,11 @@
static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
+static inline struct nf_icmp_net *icmp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp;
+}
+
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct nf_conntrack_tuple *tuple)
{
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s,
static unsigned int *icmp_get_timeouts(struct net *net)
{
- return &nf_ct_icmp_timeout;
+ return &icmp_pernet(net)->timeout;
}
/* Returns verdict for packet, or -1 for invalid. */
@@ -228,10 +233,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
static int icmp_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *t)
{
- NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
- NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
- NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
-
+ if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
-static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
{
unsigned int *timeout = data;
+ struct nf_icmp_net *in = icmp_pernet(net);
if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
*timeout =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
} else {
/* Set default ICMP timeout. */
- *timeout = nf_ct_icmp_timeout;
+ *timeout = in->timeout;
}
return 0;
}
@@ -293,8 +300,8 @@ icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
{
const unsigned int *timeout = data;
- NLA_PUT_BE32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ));
-
+ if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *icmp_sysctl_header;
static struct ctl_table icmp_sysctl_table[] = {
{
.procname = "nf_conntrack_icmp_timeout",
- .data = &nf_ct_icmp_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = {
static struct ctl_table icmp_compat_sysctl_table[] = {
{
.procname = "ip_conntrack_icmp_timeout",
- .data = &nf_ct_icmp_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
+static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(icmp_sysctl_table,
+ sizeof(icmp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &in->timeout;
+#endif
+ return 0;
+}
+
+static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
+ sizeof(icmp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &in->timeout;
+#endif
+#endif
+ return 0;
+}
+
+static int icmp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_icmp_net *in = icmp_pernet(net);
+ struct nf_proto_net *pn = &in->pn;
+
+ in->timeout = nf_ct_icmp_timeout;
+
+ ret = icmp_kmemdup_compat_sysctl_table(pn, in);
+ if (ret < 0)
+ return ret;
+
+ ret = icmp_kmemdup_sysctl_table(pn, in);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+
+ return ret;
+}
+
+static struct nf_proto_net *icmp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp.pn;
+}
+
struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
{
.l3proto = PF_INET,
@@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.nla_policy = icmp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
- .ctl_table_header = &icmp_sysctl_header,
- .ctl_table = icmp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = icmp_compat_sysctl_table,
-#endif
-#endif
+ .init_net = icmp_init_net,
+ .get_net_proto = icmp_get_net_proto,
};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9bb1b8a37a22..742815518b0f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 7b22382ff0e9..3c04d24e2976 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -13,10 +13,10 @@
#include <linux/skbuff.h>
#include <linux/udp.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
#include <linux/netfilter/nf_conntrack_amanda.h>
MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index abb52adf5acd..44b082fd48ab 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
.expectfn = nf_nat_follow_master,
};
+static struct nfq_ct_nat_hook nfq_ct_nat = {
+ .seq_adjust = nf_nat_tcp_seq_adjust,
+};
+
static int __init nf_nat_init(void)
{
size_t i;
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void)
nfnetlink_parse_nat_setup);
BUG_ON(nf_ct_nat_offset != NULL);
RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
+ RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
return 0;
cleanup_extend:
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void)
RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
+ RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
synchronize_net();
}
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 82536701e3a3..c6784a18c1c4 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -42,9 +42,7 @@ static int set_addr(struct sk_buff *skb,
if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
- " error\n");
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
return -1;
}
@@ -58,9 +56,7 @@ static int set_addr(struct sk_buff *skb,
if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
- " error\n");
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
return -1;
}
/* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
@@ -99,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data,
TransportAddress *taddr, int count)
{
- const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int i;
__be16 port;
@@ -182,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
struct nf_conntrack_expect *rtp_exp,
struct nf_conntrack_expect *rtcp_exp)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int i;
u_int16_t nated_port;
@@ -214,8 +210,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/* Run out of expectations */
if (i >= H323_RTP_CHANNEL_MAX) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of expectations\n");
+ net_notice_ratelimited("nf_nat_h323: out of expectations\n");
return 0;
}
@@ -244,8 +239,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of RTP ports\n");
+ net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
return 0;
}
@@ -308,8 +302,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
return 0;
}
@@ -337,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
u_int16_t nated_port = ntohs(port);
@@ -365,8 +358,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_q931: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
}
@@ -427,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, TransportAddress *taddr, int idx,
__be16 port, struct nf_conntrack_expect *exp)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
u_int16_t nated_port = ntohs(port);
union nf_inet_addr addr;
@@ -456,8 +448,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_ras: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
return 0;
}
@@ -545,8 +536,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_q931: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
}
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index af65958f6308..2e59ad0b90ca 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+ u32 ctinfo, int off)
+{
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP)
+ return;
+
+ th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
+ nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
+
static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
int datalen, __sum16 *check, int oldlen)
{
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index c273d58980ae..388140881ebe 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_ipv4_range range;
- ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
+ ct_pptp_info = nfct_help_data(master);
nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
/* And here goes the grand finale of corrosion... */
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
__be16 new_callid;
unsigned int cid_off;
- ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
+ ct_pptp_info = nfct_help_data(ct);
nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
new_callid = ct_pptp_info->pns_call_id;
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
struct nf_ct_pptp_master *ct_pptp_info;
struct nf_nat_pptp *nat_pptp_info;
- ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
+ ct_pptp_info = nfct_help_data(ct);
nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
/* save original PAC call ID in nat_info */
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 57932c43960e..ea4a23813d26 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -283,7 +283,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
__be32 newip;
u_int16_t port;
char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
- unsigned buflen;
+ unsigned int buflen;
/* Connection will come from reply */
if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 2133c30a4a5f..bac712293fd6 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
ptr = *octets;
while (ctx->pointer < eoc) {
- if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
+ if (!asn1_octet_decode(ctx, ptr++)) {
kfree(*octets);
*octets = NULL;
return 0;
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
}
break;
case SNMP_OBJECTID:
- if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
+ if (!asn1_oid_decode(ctx, end, &lp, &len)) {
kfree(id);
return 0;
}
@@ -1206,8 +1206,7 @@ static int snmp_translate(struct nf_conn *ct,
if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
paylen, &map, &udph->check)) {
- if (net_ratelimit())
- printk(KERN_WARNING "bsalg: parser failed\n");
+ net_warn_ratelimited("bsalg: parser failed\n");
return NF_DROP;
}
return NF_ACCEPT;
@@ -1241,9 +1240,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,
* can mess around with the payload.
*/
if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
- &iph->saddr, &iph->daddr);
+ net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
+ &iph->saddr, &iph->daddr);
return NF_DROP;
}
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index a2901bf829c0..9dbb8d284f99 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -8,10 +8,10 @@
#include <linux/module.h>
#include <linux/udp.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
#include <linux/netfilter/nf_conntrack_tftp.h>
MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 50009c787bcd..6232d476f37e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -51,15 +51,16 @@ static struct ping_table ping_table;
static u16 ping_port_rover;
-static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
+static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
{
int res = (num + net_hash_mix(net)) & mask;
+
pr_debug("hash(%d) = %d\n", num, res);
return res;
}
static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
- struct net *net, unsigned num)
+ struct net *net, unsigned int num)
{
return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
}
@@ -188,7 +189,8 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
gid_t *high)
{
gid_t *data = net->ipv4.sysctl_ping_group_range;
- unsigned seq;
+ unsigned int seq;
+
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -205,17 +207,22 @@ static int ping_init_sock(struct sock *sk)
gid_t range[2];
struct group_info *group_info = get_current_groups();
int i, j, count = group_info->ngroups;
+ kgid_t low, high;
inet_get_ping_group_range_net(net, range, range+1);
+ low = make_kgid(&init_user_ns, range[0]);
+ high = make_kgid(&init_user_ns, range[1]);
+ if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
+ return -EACCES;
+
if (range[0] <= group && group <= range[1])
return 0;
for (i = 0; i < group_info->nblocks; i++) {
int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
-
for (j = 0; j < cp_count; j++) {
- group = group_info->blocks[i][j];
- if (range[0] <= group && group <= range[1])
+ kgid_t gid = group_info->blocks[i][j];
+ if (gid_lte(low, gid) && gid_lte(gid, high))
return 0;
}
@@ -364,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info)
break;
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
@@ -379,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info)
break;
case ICMP_REDIRECT:
/* See ICMP_SOURCE_QUENCH */
+ ipv4_sk_redirect(skb, sk);
err = EREMOTEIO;
break;
}
@@ -410,7 +419,7 @@ struct pingfakehdr {
__wsum wcheck;
};
-static int ping_getfrag(void *from, char * to,
+static int ping_getfrag(void *from, char *to,
int offset, int fraglen, int odd, struct sk_buff *skb)
{
struct pingfakehdr *pfh = (struct pingfakehdr *)from;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44e4e22..957acd12250b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
- SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -258,6 +257,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+ SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+ SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+ SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+ SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+ SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+ SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b2..8918eff1426d 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash = protocol & (MAX_INET_PROTOS - 1);
-
- return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+ int ret;
- ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
prot, NULL) == prot) ? 0 : -1;
synchronize_net();
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bbd604c68e68..ff0f071969ea 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -216,6 +216,11 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
int err = 0;
int harderr = 0;
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_sk_update_pmtu(skb, sk, info);
+ else if (type == ICMP_REDIRECT)
+ ipv4_sk_redirect(skb, sk);
+
/* Report error on raw socket, if:
1. User requested ip_recverr.
2. Socket is connected (otherwise the error indication
@@ -288,7 +293,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
read_unlock(&raw_v4_hashinfo.lock);
}
-static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Charge it to the socket. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4dc1c104c942..d547f6fae20d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,6 +109,7 @@
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
+#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
@@ -147,7 +148,10 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst);
static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu);
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
static int rt_garbage_collect(struct dst_ops *ops);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -157,40 +161,13 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
- struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer;
- u32 *p = NULL;
-
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 1);
-
- peer = rt->peer;
- if (peer) {
- u32 *old_p = __DST_METRICS_PTR(old);
- unsigned long prev, new;
-
- p = peer->metrics;
- if (inet_metrics_new(peer))
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
- new = (unsigned long) p;
- prev = cmpxchg(&dst->_metrics, old, new);
-
- if (prev != old) {
- p = __DST_METRICS_PTR(prev);
- if (prev & DST_METRICS_READ_ONLY)
- p = NULL;
- } else {
- if (rt->fi) {
- fib_info_put(rt->fi);
- rt->fi = NULL;
- }
- }
- }
- return p;
+ WARN_ON(1);
+ return NULL;
}
-static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr);
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
@@ -205,6 +182,7 @@ static struct dst_ops ipv4_dst_ops = {
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
+ .redirect = ip_do_redirect,
.local_out = __ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
};
@@ -229,7 +207,7 @@ const __u8 ip_tos2prio[16] = {
TC_PRIO_INTERACTIVE_BULK,
ECN_OR_COST(INTERACTIVE_BULK)
};
-
+EXPORT_SYMBOL(ip_tos2prio);
/*
* Route cache.
@@ -296,7 +274,7 @@ static inline void rt_hash_lock_init(void)
#endif
static struct rt_hash_bucket *rt_hash_table __read_mostly;
-static unsigned rt_hash_mask __read_mostly;
+static unsigned int rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
@@ -420,29 +398,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
"HHUptod\tSpecDst");
else {
struct rtable *r = v;
- struct neighbour *n;
- int len, HHUptod;
-
- rcu_read_lock();
- n = dst_get_neighbour_noref(&r->dst);
- HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
- rcu_read_unlock();
+ int len;
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
- "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->dst.dev ? r->dst.dev->name : "*",
- (__force u32)r->rt_dst,
- (__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->dst.__refcnt),
- r->dst.__use, 0, (__force u32)r->rt_src,
- dst_metric_advmss(&r->dst) + 40,
- dst_metric(&r->dst, RTAX_WINDOW),
- (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
- dst_metric(&r->dst, RTAX_RTTVAR)),
- r->rt_key_tos,
- -1,
- HHUptod,
- r->rt_spec_dst, &len);
+ "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+ r->dst.dev ? r->dst.dev->name : "*",
+ (__force u32)r->rt_dst,
+ (__force u32)r->rt_gateway,
+ r->rt_flags, atomic_read(&r->dst.__refcnt),
+ r->dst.__use, 0, (__force u32)r->rt_src,
+ dst_metric_advmss(&r->dst) + 40,
+ dst_metric(&r->dst, RTAX_WINDOW), 0,
+ r->rt_key_tos,
+ -1, 0, 0, &len);
seq_printf(seq, "%*s\n", 127 - len, "");
}
@@ -679,7 +647,7 @@ static inline int rt_fast_clean(struct rtable *rth)
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- (rth->peer && rth->peer->pmtu_expires);
+ rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -872,34 +840,22 @@ static void rt_check_expire(void)
while ((rth = rcu_dereference_protected(*rthp,
lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth)) {
+ if (rt_is_expired(rth) ||
+ rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
*rthp = rth->dst.rt_next;
rt_free(rth);
continue;
}
- if (rth->dst.expires) {
- /* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- /*
- * We only count entries on
- * a chain with equal hash inputs once
- * so that entries for different QOS
- * levels, and other non-hash input
- * attributes don't unfairly skew
- * the length computation
- */
- length += has_noalias(rt_hash_table[i].chain, rth);
- continue;
- }
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
- goto nofree;
- /* Cleanup aged off entries. */
- *rthp = rth->dst.rt_next;
- rt_free(rth);
+ /* We only count entries on a chain with equal
+ * hash inputs once so that entries for
+ * different QOS levels, and other non-hash
+ * input attributes don't unfairly skew the
+ * length computation
+ */
+ tmo >>= 1;
+ rthp = &rth->dst.rt_next;
+ length += has_noalias(rt_hash_table[i].chain, rth);
}
spin_unlock_bh(rt_hash_lock_addr(i));
sum += length;
@@ -937,7 +893,6 @@ static void rt_cache_invalidate(struct net *net)
get_random_bytes(&shuffle, sizeof(shuffle));
atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
- inetpeer_invalidate_tree(AF_INET);
}
/*
@@ -959,8 +914,7 @@ void rt_cache_flush_batch(struct net *net)
static void rt_emergency_hash_rebuild(struct net *net)
{
- if (net_ratelimit())
- pr_warn("Route hash chain too long!\n");
+ net_warn_ratelimited("Route hash chain too long!\n");
rt_cache_invalidate(net);
}
@@ -1083,8 +1037,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
goto out;
if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
- if (net_ratelimit())
- pr_warn("dst cache overflow\n");
+ net_warn_ratelimited("dst cache overflow\n");
RT_CACHE_STAT_INC(gc_dst_overflow);
return 1;
@@ -1112,20 +1065,20 @@ static int slow_chain_length(const struct rtable *head)
return length >> FRACT_BITS;
}
-static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
{
- static const __be32 inaddr_any = 0;
struct net_device *dev = dst->dev;
const __be32 *pkey = daddr;
const struct rtable *rt;
struct neighbour *n;
rt = (const struct rtable *) dst;
-
- if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
- pkey = &inaddr_any;
- else if (rt->rt_gateway)
+ if (rt->rt_gateway)
pkey = (const __be32 *) &rt->rt_gateway;
+ else if (skb)
+ pkey = &ip_hdr(skb)->daddr;
n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
if (n)
@@ -1133,17 +1086,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
return neigh_create(&arp_tbl, pkey, dev);
}
-static int rt_bind_neighbour(struct rtable *rt)
-{
- struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
- if (IS_ERR(n))
- return PTR_ERR(n);
- dst_set_neighbour(&rt->dst, n);
-
- return 0;
-}
-
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
+static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
struct sk_buff *skb, int ifindex)
{
struct rtable *rth, *cand;
@@ -1151,7 +1094,6 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
unsigned long now;
u32 min_score;
int chain_length;
- int attempts = !in_softirq();
restart:
chain_length = 0;
@@ -1160,7 +1102,7 @@ restart:
candp = NULL;
now = jiffies;
- if (!rt_caching(dev_net(rt->dst.dev))) {
+ if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
/*
* If we're not caching, just tell the caller we
* were successful and don't touch the route. The
@@ -1178,16 +1120,6 @@ restart:
*/
rt->dst.flags |= DST_NOCACHE;
- if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
- int err = rt_bind_neighbour(rt);
- if (err) {
- if (net_ratelimit())
- pr_warn("Neighbour table failure & not caching routes\n");
- ip_rt_put(rt);
- return ERR_PTR(err);
- }
- }
-
goto skip_hashing;
}
@@ -1270,41 +1202,6 @@ restart:
}
}
- /* Try to bind route to arp only if it is output
- route or unicast forwarding path.
- */
- if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
- int err = rt_bind_neighbour(rt);
- if (err) {
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
- if (err != -ENOBUFS) {
- rt_drop(rt);
- return ERR_PTR(err);
- }
-
- /* Neighbour tables are full and nothing
- can be released. Try to shrink route cache,
- it is most likely it holds some neighbour records.
- */
- if (attempts-- > 0) {
- int saved_elasticity = ip_rt_gc_elasticity;
- int saved_int = ip_rt_gc_min_interval;
- ip_rt_gc_elasticity = 1;
- ip_rt_gc_min_interval = 0;
- rt_garbage_collect(&ipv4_dst_ops);
- ip_rt_gc_min_interval = saved_int;
- ip_rt_gc_elasticity = saved_elasticity;
- goto restart;
- }
-
- if (net_ratelimit())
- pr_warn("Neighbour table overflow\n");
- rt_drop(rt);
- return ERR_PTR(-ENOBUFS);
- }
- }
-
rt->dst.rt_next = rt_hash_table[hash].chain;
/*
@@ -1322,25 +1219,6 @@ skip_hashing:
return rt;
}
-static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt_peer_genid(void)
-{
- return atomic_read(&__rt_peer_genid);
-}
-
-void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
-{
- struct inet_peer *peer;
-
- peer = inet_getpeer_v4(daddr, create);
-
- if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
- inet_putpeer(peer);
- else
- rt->rt_peer_genid = rt_peer_genid();
-}
-
/*
* Peer allocation may fail only in serious out-of-memory conditions. However
* we still can generate some output.
@@ -1363,28 +1241,21 @@ static void ip_select_fb_ident(struct iphdr *iph)
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt && !(rt->dst.flags & DST_NOPEER)) {
- if (rt->peer == NULL)
- rt_bind_peer(rt, rt->rt_dst, 1);
+ struct net *net = dev_net(dst->dev);
+ struct inet_peer *peer;
- /* If peer is attached to destination, it is never detached,
- so that we need not to grab a lock to dereference it.
- */
- if (rt->peer) {
- iph->id = htons(inet_getid(rt->peer, more));
- return;
- }
- } else if (!rt)
- printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
- __builtin_return_address(0));
+ peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
+ if (peer) {
+ iph->id = htons(inet_getid(peer, more));
+ inet_putpeer(peer);
+ return;
+ }
ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
-static void rt_del(unsigned hash, struct rtable *rt)
+static void rt_del(unsigned int hash, struct rtable *rt)
{
struct rtable __rcu **rthp;
struct rtable *aux;
@@ -1404,43 +1275,173 @@ static void rt_del(unsigned hash, struct rtable *rt)
spin_unlock_bh(rt_hash_lock_addr(hash));
}
-static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct iphdr *iph,
+ int oif, u8 tos,
+ u8 prot, u32 mark, int flow_flags)
{
- struct rtable *rt = (struct rtable *) dst;
- __be32 orig_gw = rt->rt_gateway;
- struct neighbour *n, *old_n;
+ if (sk) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ oif = sk->sk_bound_dev_if;
+ mark = sk->sk_mark;
+ tos = RT_CONN_FLAGS(sk);
+ prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+ }
+ flowi4_init_output(fl4, oif, mark, tos,
+ RT_SCOPE_UNIVERSE, prot,
+ flow_flags,
+ iph->daddr, iph->saddr, 0, 0);
+}
- dst_confirm(&rt->dst);
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+ const struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
- rt->rt_gateway = peer->redirect_learned.a4;
+ __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+}
- n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
- if (IS_ERR(n)) {
- rt->rt_gateway = orig_gw;
- return;
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ daddr, inet->inet_saddr, 0, 0);
+ rcu_read_unlock();
+}
+
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ if (skb)
+ build_skb_flow_key(fl4, skb, sk);
+ else
+ build_sk_flow_key(fl4, sk);
+}
+
+static DEFINE_SEQLOCK(fnhe_seqlock);
+
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+{
+ struct fib_nh_exception *fnhe, *oldest;
+
+ oldest = rcu_dereference(hash->chain);
+ for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ oldest = fnhe;
+ }
+ return oldest;
+}
+
+static inline u32 fnhe_hashfun(__be32 daddr)
+{
+ u32 hval;
+
+ hval = (__force u32) daddr;
+ hval ^= (hval >> 11) ^ (hval >> 22);
+
+ return hval & (FNHE_HASH_SIZE - 1);
+}
+
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+ u32 pmtu, unsigned long expires)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe;
+ int depth;
+ u32 hval = fnhe_hashfun(daddr);
+
+ write_seqlock_bh(&fnhe_seqlock);
+
+ hash = nh->nh_exceptions;
+ if (!hash) {
+ hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ goto out_unlock;
+ nh->nh_exceptions = hash;
+ }
+
+ hash += hval;
+
+ depth = 0;
+ for (fnhe = rcu_dereference(hash->chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ break;
+ depth++;
}
- old_n = xchg(&rt->dst._neighbour, n);
- if (old_n)
- neigh_release(old_n);
- if (!(n->nud_state & NUD_VALID)) {
- neigh_event_send(n, NULL);
+
+ if (fnhe) {
+ if (gw)
+ fnhe->fnhe_gw = gw;
+ if (pmtu) {
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = expires;
+ }
} else {
- rt->rt_flags |= RTCF_REDIRECTED;
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+ if (depth > FNHE_RECLAIM_DEPTH)
+ fnhe = fnhe_oldest(hash);
+ else {
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+ rcu_assign_pointer(hash->chain, fnhe);
+ }
+ fnhe->fnhe_daddr = daddr;
+ fnhe->fnhe_gw = gw;
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = expires;
}
+
+ fnhe->fnhe_stamp = jiffies;
+
+out_unlock:
+ write_sequnlock_bh(&fnhe_seqlock);
+ return;
}
-/* called in rcu_read_lock() section */
-void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
- __be32 saddr, struct net_device *dev)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
{
- int s, i;
- struct in_device *in_dev = __in_dev_get_rcu(dev);
- __be32 skeys[2] = { saddr, 0 };
- int ikeys[2] = { dev->ifindex, 0 };
- struct inet_peer *peer;
+ __be32 new_gw = icmp_hdr(skb)->un.gateway;
+ __be32 old_gw = ip_hdr(skb)->saddr;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct neighbour *n;
struct net *net;
+ switch (icmp_hdr(skb)->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ break;
+
+ default:
+ return;
+ }
+
+ if (rt->rt_gateway != old_gw)
+ return;
+
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
return;
@@ -1460,72 +1461,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
goto reject_redirect;
}
- for (s = 0; s < 2; s++) {
- for (i = 0; i < 2; i++) {
- unsigned int hash;
- struct rtable __rcu **rthp;
- struct rtable *rt;
-
- hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
-
- rthp = &rt_hash_table[hash].chain;
-
- while ((rt = rcu_dereference(*rthp)) != NULL) {
- rthp = &rt->dst.rt_next;
-
- if (rt->rt_key_dst != daddr ||
- rt->rt_key_src != skeys[s] ||
- rt->rt_oif != ikeys[i] ||
- rt_is_input_route(rt) ||
- rt_is_expired(rt) ||
- !net_eq(dev_net(rt->dst.dev), net) ||
- rt->dst.error ||
- rt->dst.dev != dev ||
- rt->rt_gateway != old_gw)
- continue;
-
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 1);
+ n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ if (n) {
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ if (fib_lookup(net, fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
- peer = rt->peer;
- if (peer) {
- if (peer->redirect_learned.a4 != new_gw) {
- peer->redirect_learned.a4 = new_gw;
- atomic_inc(&__rt_peer_genid);
- }
- check_peer_redir(&rt->dst, peer);
- }
+ update_or_create_fnhe(nh, fl4->daddr, new_gw,
+ 0, 0);
}
+ rt->rt_gateway = new_gw;
+ rt->rt_flags |= RTCF_REDIRECTED;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
+ neigh_release(n);
}
return;
reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
- " Advised path = %pI4 -> %pI4\n",
- &old_gw, dev->name, &new_gw,
- &saddr, &daddr);
+ if (IN_DEV_LOG_MARTIANS(in_dev)) {
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ __be32 daddr = iph->daddr;
+ __be32 saddr = iph->saddr;
+
+ net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+ " Advised path = %pI4 -> %pI4\n",
+ &old_gw, dev->name, &new_gw,
+ &saddr, &daddr);
+ }
#endif
;
}
-static bool peer_pmtu_expired(struct inet_peer *peer)
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
- unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
- return orig &&
- time_after_eq(jiffies, orig) &&
- cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
+ struct rtable *rt;
+ struct flowi4 fl4;
-static bool peer_pmtu_cleaned(struct inet_peer *peer)
-{
- unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
+ rt = (struct rtable *) dst;
- return orig &&
- cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
+ ip_rt_build_flow_key(&fl4, sk, skb);
+ __ip_do_redirect(rt, skb, &fl4);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1537,14 +1516,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
- } else if (rt->rt_flags & RTCF_REDIRECTED) {
- unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->dst.expires) {
+ unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
rt->rt_oif,
rt_genid(dev_net(dst->dev)));
rt_del(hash, rt);
ret = NULL;
- } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
- dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
}
}
return ret;
@@ -1571,6 +1549,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
struct inet_peer *peer;
+ struct net *net;
int log_martians;
rcu_read_lock();
@@ -1582,9 +1561,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 1);
- peer = rt->peer;
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
if (!peer) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
return;
@@ -1601,7 +1579,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
*/
if (peer->rate_tokens >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
- return;
+ goto out_put_peer;
}
/* Check for load limit; set rate_last to the latest sent
@@ -1616,23 +1594,40 @@ void ip_rt_send_redirect(struct sk_buff *skb)
++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- peer->rate_tokens == ip_rt_redirect_number &&
- net_ratelimit())
- pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
- &ip_hdr(skb)->saddr, rt->rt_iif,
- &rt->rt_dst, &rt->rt_gateway);
+ peer->rate_tokens == ip_rt_redirect_number)
+ net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+ &ip_hdr(skb)->saddr, rt->rt_iif,
+ &rt->rt_dst, &rt->rt_gateway);
#endif
}
+out_put_peer:
+ inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
{
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
struct rtable *rt = skb_rtable(skb);
struct inet_peer *peer;
unsigned long now;
+ struct net *net;
bool send;
int code;
+ net = dev_net(rt->dst.dev);
+ if (!IN_DEV_FORWARD(in_dev)) {
+ switch (rt->dst.error) {
+ case EHOSTUNREACH:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
+ break;
+
+ case ENETUNREACH:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ }
+ goto out;
+ }
+
switch (rt->dst.error) {
case EINVAL:
default:
@@ -1642,17 +1637,14 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(dev_net(rt->dst.dev),
- IPSTATS_MIB_INNOROUTES);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
code = ICMP_PKT_FILTERED;
break;
}
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 1);
- peer = rt->peer;
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
send = true;
if (peer) {
@@ -1665,6 +1657,7 @@ static int ip_error(struct sk_buff *skb)
peer->rate_tokens -= ip_rt_error_cost;
else
send = false;
+ inet_putpeer(peer);
}
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1673,136 +1666,96 @@ out: kfree_skb(skb);
return 0;
}
-/*
- * The last two values are not from the RFC but
- * are needed for AMPRnet AX.25 paths.
- */
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+{
+ struct fib_result res;
-static const unsigned short mtu_plateau[] =
-{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
-static inline unsigned short guess_mtu(unsigned short old_mtu)
-{
- int i;
+ if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
- for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
- if (old_mtu > mtu_plateau[i])
- return mtu_plateau[i];
- return 68;
+ update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+ jiffies + ip_rt_mtu_expires);
+ }
+ rt->rt_pmtu = mtu;
+ dst_set_expires(&rt->dst, ip_rt_mtu_expires);
}
-unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
- unsigned short new_mtu,
- struct net_device *dev)
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
{
- unsigned short old_mtu = ntohs(iph->tot_len);
- unsigned short est_mtu = 0;
- struct inet_peer *peer;
-
- peer = inet_getpeer_v4(iph->daddr, 1);
- if (peer) {
- unsigned short mtu = new_mtu;
-
- if (new_mtu < 68 || new_mtu >= old_mtu) {
- /* BSD 4.2 derived systems incorrectly adjust
- * tot_len by the IP header length, and report
- * a zero MTU in the ICMP message.
- */
- if (mtu == 0 &&
- old_mtu >= 68 + (iph->ihl << 2))
- old_mtu -= iph->ihl << 2;
- mtu = guess_mtu(old_mtu);
- }
-
- if (mtu < ip_rt_min_pmtu)
- mtu = ip_rt_min_pmtu;
- if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
- unsigned long pmtu_expires;
+ struct rtable *rt = (struct rtable *) dst;
+ struct flowi4 fl4;
- pmtu_expires = jiffies + ip_rt_mtu_expires;
- if (!pmtu_expires)
- pmtu_expires = 1UL;
+ ip_rt_build_flow_key(&fl4, sk, skb);
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+}
- est_mtu = mtu;
- peer->pmtu_learned = mtu;
- peer->pmtu_expires = pmtu_expires;
- atomic_inc(&__rt_peer_genid);
- }
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+ int oif, u32 mark, u8 protocol, int flow_flags)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
- inet_putpeer(peer);
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
}
- return est_mtu ? : new_mtu;
}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
-static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
- if (!expires)
- return;
- if (time_before(jiffies, expires)) {
- u32 orig_dst_mtu = dst_mtu(dst);
- if (peer->pmtu_learned < orig_dst_mtu) {
- if (!peer->pmtu_orig)
- peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
- }
- } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
+ }
}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+ int oif, u32 mark, u8 protocol, int flow_flags)
{
- struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer;
-
- dst_confirm(dst);
-
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 1);
- peer = rt->peer;
- if (peer) {
- unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
-
- if (mtu < ip_rt_min_pmtu)
- mtu = ip_rt_min_pmtu;
- if (!pmtu_expires || mtu < peer->pmtu_learned) {
-
- pmtu_expires = jiffies + ip_rt_mtu_expires;
- if (!pmtu_expires)
- pmtu_expires = 1UL;
-
- peer->pmtu_learned = mtu;
- peer->pmtu_expires = pmtu_expires;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
- atomic_inc(&__rt_peer_genid);
- rt->rt_peer_genid = rt_peer_genid();
- }
- check_peer_pmtu(dst, peer);
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4);
+ ip_rt_put(rt);
}
}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
-
-static void ipv4_validate_peer(struct rtable *rt)
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- if (rt->rt_peer_genid != rt_peer_genid()) {
- struct inet_peer *peer;
-
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 0);
-
- peer = rt->peer;
- if (peer) {
- check_peer_pmtu(&rt->dst, peer);
-
- if (peer->redirect_learned.a4 &&
- peer->redirect_learned.a4 != rt->rt_gateway)
- check_peer_redir(&rt->dst, peer);
- }
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
- rt->rt_peer_genid = rt_peer_genid();
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4);
+ ip_rt_put(rt);
}
}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -1810,23 +1763,17 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
if (rt_is_expired(rt))
return NULL;
- ipv4_validate_peer(rt);
return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer = rt->peer;
if (rt->fi) {
fib_info_put(rt->fi);
rt->fi = NULL;
}
- if (peer) {
- rt->peer = NULL;
- inet_putpeer(peer);
- }
}
@@ -1837,15 +1784,15 @@ static void ipv4_link_failure(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb_rtable(skb);
- if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
- dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+ if (rt)
+ dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
{
- printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
- skb->dev ? skb->dev->name : "?");
+ pr_debug("%s: %pI4 -> %pI4, %s\n",
+ __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ skb->dev ? skb->dev->name : "?");
kfree_skb(skb);
WARN_ON(1);
return 0;
@@ -1918,7 +1865,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
const struct rtable *rt = (const struct rtable *) dst;
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+ unsigned int mtu = rt->rt_pmtu;
+
+ if (mtu && time_after_eq(jiffies, rt->dst.expires))
+ mtu = 0;
+
+ if (!mtu)
+ mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu && rt_is_output_route(rt))
return mtu;
@@ -1940,36 +1893,50 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
struct fib_info *fi)
{
- struct inet_peer *peer;
- int create = 0;
+ if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+ rt->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ }
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+}
- /* If a peer entry exists for this destination, we must hook
- * it up in order to get at cached metrics.
- */
- if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
- create = 1;
+static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ struct fib_nh_exception *fnhe;
+ u32 hval;
- rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
- if (peer) {
- rt->rt_peer_genid = rt_peer_genid();
- if (inet_metrics_new(peer))
- memcpy(peer->metrics, fi->fib_metrics,
- sizeof(u32) * RTAX_MAX);
- dst_init_metrics(&rt->dst, peer->metrics, false);
+ hval = fnhe_hashfun(daddr);
- check_peer_pmtu(&rt->dst, peer);
+restart:
+ for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ __be32 fnhe_daddr, gw;
+ unsigned long expires;
+ unsigned int seq;
+ u32 pmtu;
+
+ seq = read_seqbegin(&fnhe_seqlock);
+ fnhe_daddr = fnhe->fnhe_daddr;
+ gw = fnhe->fnhe_gw;
+ pmtu = fnhe->fnhe_pmtu;
+ expires = fnhe->fnhe_expires;
+ if (read_seqretry(&fnhe_seqlock, seq))
+ goto restart;
+ if (daddr != fnhe_daddr)
+ continue;
+ if (pmtu) {
+ unsigned long diff = expires - jiffies;
- if (peer->redirect_learned.a4 &&
- peer->redirect_learned.a4 != rt->rt_gateway) {
- rt->rt_gateway = peer->redirect_learned.a4;
- rt->rt_flags |= RTCF_REDIRECTED;
- }
- } else {
- if (fi->fib_metrics != (u32 *) dst_default_metrics) {
- rt->fi = fi;
- atomic_inc(&fi->fib_clntref);
+ if (time_before(jiffies, expires)) {
+ rt->rt_pmtu = pmtu;
+ dst_set_expires(&rt->dst, diff);
+ }
}
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ if (gw)
+ rt->rt_gateway = gw;
+ fnhe->fnhe_stamp = jiffies;
+ break;
}
}
@@ -1977,26 +1944,22 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
const struct fib_result *res,
struct fib_info *fi, u16 type, u32 itag)
{
- struct dst_entry *dst = &rt->dst;
-
if (fi) {
- if (FIB_RES_GW(*res) &&
- FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- rt->rt_gateway = FIB_RES_GW(*res);
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = nh->nh_gw;
+ if (unlikely(nh->nh_exceptions))
+ rt_bind_exception(rt, nh, fl4->daddr);
rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
- dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
+ rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
}
- if (dst_mtu(dst) > IP_MAX_MTU)
- dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
- if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
- dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
-
#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
- set_class_tag(rt, fib_rules_tclass(res));
+ set_class_tag(rt, res->tclassid);
#endif
set_class_tag(rt, itag);
#endif
@@ -2017,7 +1980,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
unsigned int hash;
struct rtable *rth;
- __be32 spec_dst;
struct in_device *in_dev = __in_dev_get_rcu(dev);
u32 itag = 0;
int err;
@@ -2028,20 +1990,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
- ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
+ skb->protocol != htons(ETH_P_IP))
goto e_inval;
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(saddr))
+ goto e_inval;
+
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
goto e_inval;
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else {
- err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
- &itag);
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
if (err < 0)
goto e_err;
}
- rth = rt_dst_alloc(init_net.loopback_dev,
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
@@ -2063,10 +2028,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_iif = dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->rt_peer_genid = 0;
- rth->peer = NULL;
rth->fi = NULL;
if (our) {
rth->dst.input= ip_local_deliver;
@@ -2128,20 +2091,18 @@ static int __mkroute_input(struct sk_buff *skb,
int err;
struct in_device *out_dev;
unsigned int flags = 0;
- __be32 spec_dst;
u32 itag;
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
- if (net_ratelimit())
- pr_crit("Bug in ip_route_input_slow(). Please report.\n");
+ net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
}
err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
- in_dev->dev, &spec_dst, &itag);
+ in_dev->dev, in_dev, &itag);
if (err < 0) {
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
@@ -2192,10 +2153,8 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_iif = in_dev->dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->rt_peer_genid = 0;
- rth->peer = NULL;
rth->fi = NULL;
rth->dst.input = ip_forward;
@@ -2215,9 +2174,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
- struct rtable* rth = NULL;
+ struct rtable *rth = NULL;
int err;
- unsigned hash;
+ unsigned int hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
@@ -2255,13 +2214,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flowi4 fl4;
- unsigned flags = 0;
+ unsigned int flags = 0;
u32 itag = 0;
- struct rtable * rth;
- unsigned hash;
- __be32 spec_dst;
+ struct rtable *rth;
+ unsigned int hash;
int err = -EINVAL;
- struct net * net = dev_net(dev);
+ struct net *net = dev_net(dev);
/* IP on this device is disabled. */
@@ -2272,8 +2230,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
- if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
- ipv4_is_loopback(saddr))
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
@@ -2285,9 +2242,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_zeronet(saddr))
goto martian_source;
- if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
+ if (ipv4_is_zeronet(daddr))
goto martian_destination;
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
+ if (ipv4_is_loopback(daddr))
+ goto martian_destination;
+
+ if (ipv4_is_loopback(saddr))
+ goto martian_source;
+ }
+
/*
* Now we are ready to route packet.
*/
@@ -2299,11 +2264,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.daddr = daddr;
fl4.saddr = saddr;
err = fib_lookup(net, &fl4, &res);
- if (err != 0) {
- if (!IN_DEV_FORWARD(in_dev))
- goto e_hostunreach;
+ if (err != 0)
goto no_route;
- }
RT_CACHE_STAT_INC(in_slow_tot);
@@ -2313,17 +2275,16 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
net->loopback_dev->ifindex,
- dev, &spec_dst, &itag);
+ dev, in_dev, &itag);
if (err < 0)
goto martian_source_keep_err;
if (err)
flags |= RTCF_DIRECTSRC;
- spec_dst = daddr;
goto local_input;
}
if (!IN_DEV_FORWARD(in_dev))
- goto e_hostunreach;
+ goto no_route;
if (res.type != RTN_UNICAST)
goto martian_destination;
@@ -2334,11 +2295,9 @@ brd_input:
if (skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (ipv4_is_zeronet(saddr))
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- else {
- err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
- &itag);
+ if (!ipv4_is_zeronet(saddr)) {
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
if (err < 0)
goto martian_source_keep_err;
if (err)
@@ -2368,17 +2327,12 @@ local_input:
rth->rt_key_tos = tos;
rth->rt_dst = daddr;
rth->rt_src = saddr;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- rth->dst.tclassid = itag;
-#endif
rth->rt_route_iif = dev->ifindex;
rth->rt_iif = dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->rt_peer_genid = 0;
- rth->peer = NULL;
rth->fi = NULL;
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
@@ -2394,7 +2348,6 @@ local_input:
no_route:
RT_CACHE_STAT_INC(in_no_route);
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
res.type = RTN_UNREACHABLE;
if (err == -ESRCH)
err = -ENETUNREACH;
@@ -2406,15 +2359,11 @@ no_route:
martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- pr_warn("martian destination %pI4 from %pI4, dev %s\n",
- &daddr, &saddr, dev->name);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
#endif
-e_hostunreach:
- err = -EHOSTUNREACH;
- goto out;
-
e_inval:
err = -EINVAL;
goto out;
@@ -2433,8 +2382,8 @@ martian_source_keep_err:
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, bool noref)
{
- struct rtable * rth;
- unsigned hash;
+ struct rtable *rth;
+ unsigned int hash;
int iif = dev->ifindex;
struct net *net;
int res;
@@ -2458,7 +2407,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- ipv4_validate_peer(rth);
if (noref) {
dst_use_noref(&rth->dst, jiffies);
skb_dst_set_noref(skb, &rth->dst);
@@ -2526,9 +2474,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
u16 type = res->type;
struct rtable *rth;
- if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ in_dev = __in_dev_get_rcu(dev_out);
+ if (!in_dev)
return ERR_PTR(-EINVAL);
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ return ERR_PTR(-EINVAL);
+
if (ipv4_is_lbcast(fl4->daddr))
type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl4->daddr))
@@ -2539,10 +2492,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- in_dev = __in_dev_get_rcu(dev_out);
- if (!in_dev)
- return ERR_PTR(-EINVAL);
-
if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
fi = NULL;
@@ -2579,20 +2528,15 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth->rt_iif = orig_oif ? : dev_out->ifindex;
rth->rt_oif = orig_oif;
rth->rt_mark = fl4->flowi4_mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = fl4->daddr;
- rth->rt_spec_dst= fl4->saddr;
- rth->rt_peer_genid = 0;
- rth->peer = NULL;
rth->fi = NULL;
RT_CACHE_STAT_INC(out_slow_tot);
- if (flags & RTCF_LOCAL) {
+ if (flags & RTCF_LOCAL)
rth->dst.input = ip_local_deliver;
- rth->rt_spec_dst = fl4->daddr;
- }
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
- rth->rt_spec_dst = fl4->saddr;
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
rth->dst.output = ip_mc_output;
@@ -2611,6 +2555,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rt_set_nexthop(rth, fl4, res, fi, type, 0);
+ if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
+ rth->dst.flags |= DST_NOCACHE;
+
return rth;
}
@@ -2630,10 +2577,9 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
__be32 orig_saddr;
int orig_oif;
+ res.tclassid = 0;
res.fi = NULL;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
+ res.table = NULL;
orig_daddr = fl4->daddr;
orig_saddr = fl4->saddr;
@@ -2736,6 +2682,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
if (fib_lookup(net, fl4, &res)) {
res.fi = NULL;
+ res.table = NULL;
if (fl4->flowi4_oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2834,7 +2781,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- ipv4_validate_peer(rth);
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
@@ -2865,7 +2811,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
return mtu ? : dst->dev->mtu;
}
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
{
}
@@ -2883,6 +2835,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.mtu = ipv4_blackhole_mtu,
.default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
+ .redirect = ipv4_rt_blackhole_redirect,
.cow_metrics = ipv4_rt_blackhole_cow_metrics,
.neigh_lookup = ipv4_neigh_lookup,
};
@@ -2898,7 +2851,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
- dst_copy_metrics(new, &ort->dst);
new->dev = ort->dst.dev;
if (new->dev)
@@ -2911,6 +2863,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_iif = ort->rt_iif;
rt->rt_oif = ort->rt_oif;
rt->rt_mark = ort->rt_mark;
+ rt->rt_pmtu = ort->rt_pmtu;
rt->rt_genid = rt_genid(net);
rt->rt_flags = ort->rt_flags;
@@ -2918,10 +2871,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_dst = ort->rt_dst;
rt->rt_src = ort->rt_src;
rt->rt_gateway = ort->rt_gateway;
- rt->rt_spec_dst = ort->rt_spec_dst;
- rt->peer = ort->peer;
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
rt->fi = ort->fi;
if (rt->fi)
atomic_inc(&rt->fi->fib_clntref);
@@ -2959,8 +2908,8 @@ static int rt_fill_info(struct net *net,
struct rtmsg *r;
struct nlmsghdr *nlh;
unsigned long expires = 0;
- const struct inet_peer *peer = rt->peer;
- u32 id = 0, ts = 0, tsage = 0, error;
+ u32 error;
+ u32 metrics[RTAX_MAX];
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
if (nlh == NULL)
@@ -2972,7 +2921,8 @@ static int rt_fill_info(struct net *net,
r->rtm_src_len = 0;
r->rtm_tos = rt->rt_key_tos;
r->rtm_table = RT_TABLE_MAIN;
- NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
+ if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
@@ -2980,47 +2930,47 @@ static int rt_fill_info(struct net *net,
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
- NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
-
+ if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
+ goto nla_put_failure;
if (rt->rt_key_src) {
r->rtm_src_len = 32;
- NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
+ if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+ goto nla_put_failure;
}
- if (rt->dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (rt->dst.tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
+ if (rt->dst.tclassid &&
+ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+ goto nla_put_failure;
#endif
- if (rt_is_input_route(rt))
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
- else if (rt->rt_src != rt->rt_key_src)
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
-
- if (rt->rt_dst != rt->rt_gateway)
- NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
+ if (!rt_is_input_route(rt) &&
+ rt->rt_src != rt->rt_key_src) {
+ if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+ goto nla_put_failure;
+ }
+ if (rt->rt_dst != rt->rt_gateway &&
+ nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
+ goto nla_put_failure;
- if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt_pmtu)
+ metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (rt->rt_mark)
- NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
+ if (rt->rt_mark &&
+ nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+ goto nla_put_failure;
error = rt->dst.error;
- if (peer) {
- inet_peer_refcheck(rt->peer);
- id = atomic_read(&peer->ip_id_count) & 0xffff;
- if (peer->tcp_ts_stamp) {
- ts = peer->tcp_ts;
- tsage = get_seconds() - peer->tcp_ts_stamp;
- }
- expires = ACCESS_ONCE(peer->pmtu_expires);
- if (expires) {
- if (time_before(jiffies, expires))
- expires -= jiffies;
- else
- expires = 0;
- }
+ expires = rt->dst.expires;
+ if (expires) {
+ if (time_before(jiffies, expires))
+ expires -= jiffies;
+ else
+ expires = 0;
}
if (rt_is_input_route(rt)) {
@@ -3045,11 +2995,11 @@ static int rt_fill_info(struct net *net,
}
} else
#endif
- NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
+ if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+ goto nla_put_failure;
}
- if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
- expires, error) < 0)
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -3059,7 +3009,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
@@ -3334,23 +3284,6 @@ static ctl_table ipv4_route_table[] = {
{ }
};
-static struct ctl_table empty[1];
-
-static struct ctl_table ipv4_skeleton[] =
-{
- { .procname = "route",
- .mode = 0555, .child = ipv4_route_table},
- { .procname = "neigh",
- .mode = 0555, .child = empty},
- { }
-};
-
-static __net_initdata struct ctl_path ipv4_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
-
static struct ctl_table ipv4_route_flush_table[] = {
{
.procname = "flush",
@@ -3361,13 +3294,6 @@ static struct ctl_table ipv4_route_flush_table[] = {
{ },
};
-static __net_initdata struct ctl_path ipv4_route_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "route", },
- { },
-};
-
static __net_init int sysctl_route_net_init(struct net *net)
{
struct ctl_table *tbl;
@@ -3380,8 +3306,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
}
tbl[0].extra1 = net;
- net->ipv4.route_hdr =
- register_net_sysctl_table(net, ipv4_route_path, tbl);
+ net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
if (net->ipv4.route_hdr == NULL)
goto err_reg;
return 0;
@@ -3422,6 +3347,30 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
.init = rt_genid_init,
};
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv4.peers = bp;
+ return 0;
+}
+
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
+{
+ struct inet_peer_base *bp = net->ipv4.peers;
+
+ net->ipv4.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
+}
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+ .init = ipv4_inetpeer_init,
+ .exit = ipv4_inetpeer_exit,
+};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
@@ -3430,9 +3379,15 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &rhash_entries);
+ if (ret)
+ return 0;
+
return 1;
}
__setup("rhash_entries=", set_rhash_entries);
@@ -3468,6 +3423,7 @@ int __init ip_rt_init(void)
0,
&rt_hash_log,
&rt_hash_mask,
+ 0,
rhash_entries ? 0 : 512 * 1024);
memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
rt_hash_lock_init();
@@ -3495,6 +3451,7 @@ int __init ip_rt_init(void)
register_pernet_subsys(&sysctl_route_ops);
#endif
register_pernet_subsys(&rt_genid_ops);
+ register_pernet_subsys(&ipv4_inetpeer_ops);
return rc;
}
@@ -3505,6 +3462,6 @@ int __init ip_rt_init(void)
*/
void __init ip_static_sysctl_init(void)
{
- register_sysctl_paths(ipv4_path, ipv4_skeleton);
+ register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7fb15d1..650e1528e1e6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7a7724da9bff..5840c3255721 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
#include <net/tcp_memcontrol.h>
static int zero;
+static int two = 2;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -78,7 +79,7 @@ static int ipv4_local_port_range(ctl_table *table, int write,
static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
{
gid_t *data = table->data;
- unsigned seq;
+ unsigned int seq;
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -300,6 +301,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_early_demux",
+ .data = &sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_dynaddr",
.data = &sysctl_ip_dynaddr,
.maxlen = sizeof(int),
@@ -359,6 +367,13 @@ static struct ctl_table ipv4_table[] = {
},
#endif
{
+ .procname = "tcp_fastopen",
+ .data = &sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_tw_recycle",
.data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
@@ -590,6 +605,20 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_limit_output_bytes",
+ .data = &sysctl_tcp_limit_output_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_challenge_ack_limit",
+ .data = &sysctl_tcp_challenge_ack_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
#ifdef CONFIG_NET_DMA
{
.procname = "tcp_dma_copybreak",
@@ -677,6 +706,15 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_early_retrans",
+ .data = &sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -768,13 +806,6 @@ static struct ctl_table ipv4_net_table[] = {
{ }
};
-struct ctl_path net_ipv4_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
-EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
-
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
struct ctl_table *table;
@@ -815,8 +846,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
tcp_init_mem(net);
- net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
- net_ipv4_ctl_path, table);
+ net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
goto err_reg;
@@ -857,12 +887,12 @@ static __init int sysctl_ipv4_init(void)
if (!i->procname)
return -EINVAL;
- hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
+ hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
if (hdr == NULL)
return -ENOMEM;
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
- unregister_sysctl_table(hdr);
+ unregister_net_sysctl_table(hdr);
return -ENOMEM;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cfd7edda0a8e..581ecf02c6b5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
#include <linux/slab.h>
#include <net/icmp.h>
+#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
@@ -363,6 +364,72 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
return period;
}
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+ INIT_LIST_HEAD(&tp->tsq_node);
+
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ tp->mdev = TCP_TIMEOUT_INIT;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = TCP_INIT_CWND;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = TCP_MSS_DEFAULT;
+
+ tp->reordering = sysctl_tcp_reordering;
+ tcp_enable_early_retrans(tp);
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+
+ sk->sk_state = TCP_CLOSE;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ icsk->icsk_sync_mss = tcp_sync_mss;
+
+ /* TCP Cookie Transactions */
+ if (sysctl_tcp_cookie_size > 0) {
+ /* Default, cookies without s_data_payload. */
+ tp->cookie_values =
+ kzalloc(sizeof(*tp->cookie_values),
+ sk->sk_allocation);
+ if (tp->cookie_values != NULL)
+ kref_init(&tp->cookie_values->kref);
+ }
+ /* Presumed zeroed, in order of appearance:
+ * cookie_in_always, cookie_out_never,
+ * s_data_constant, s_data_in, s_data_out
+ */
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(tcp_init_sock);
+
/*
* Wait for a TCP event.
*
@@ -528,7 +595,7 @@ static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
tp->pushed_seq = tp->write_seq;
}
-static inline int forced_push(const struct tcp_sock *tp)
+static inline bool forced_push(const struct tcp_sock *tp)
{
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
@@ -701,11 +768,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
if (skb) {
if (sk_wmem_schedule(sk, skb->truesize)) {
+ skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
*/
- skb_reserve(skb, skb_tailroom(skb) - size);
+ skb->avail_size = size;
return skb;
}
__kfree_skb(skb);
@@ -730,6 +798,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
inet_csk(sk)->icsk_ext_hdr_len -
tp->tcp_header_len);
+ /* TSQ : try to have two TSO segments in flight */
+ xmit_size_goal = min_t(u32, xmit_size_goal,
+ sysctl_tcp_limit_output_bytes >> 1);
+
xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
/* We try hard to avoid divides here */
@@ -783,9 +855,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
while (psize > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
struct page *page = pages[poffset / PAGE_SIZE];
- int copy, i, can_coalesce;
+ int copy, i;
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
+ bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
@@ -850,8 +923,7 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied)
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -860,7 +932,7 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle);
return copied;
@@ -911,27 +983,81 @@ static inline int select_size(const struct sock *sk, bool sg)
return tmp;
}
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+ if (tp->fastopen_req != NULL) {
+ kfree(tp->fastopen_req);
+ tp->fastopen_req = NULL;
+ }
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err, flags;
+
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ return -EOPNOTSUPP;
+ if (tp->fastopen_req != NULL)
+ return -EALREADY; /* Another Fast Open is in progress */
+
+ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+ sk->sk_allocation);
+ if (unlikely(tp->fastopen_req == NULL))
+ return -ENOBUFS;
+ tp->fastopen_req->data = msg;
+
+ flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+ err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, flags);
+ *size = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ return err;
+}
+
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int iovlen, flags, err, copied;
- int mss_now, size_goal;
+ int iovlen, flags, err, copied = 0;
+ int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
bool sg;
long timeo;
lock_sock(sk);
flags = msg->msg_flags;
+ if (flags & MSG_FASTOPEN) {
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+ if (err == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (err)
+ goto out_err;
+ offset = copied_syn;
+ }
+
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. */
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto do_error;
+
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+ goto out;
+ }
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
goto out_err;
+ /* 'common' sending to sendq */
+ }
+
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
@@ -953,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
unsigned char __user *from = iov->iov_base;
iov++;
+ if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
+ if (offset >= seglen) {
+ offset -= seglen;
+ continue;
+ }
+ seglen -= offset;
+ from += offset;
+ offset = 0;
+ }
while (seglen > 0) {
int copy = 0;
@@ -995,15 +1130,14 @@ new_segment:
copy = seglen;
/* Where to copy to? */
- if (skb_tailroom(skb) > 0) {
+ if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
+ copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, from, copy);
if (err)
goto do_fault;
} else {
- int merge = 0;
+ bool merge = false;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = sk->sk_sndmsg_page;
int off;
@@ -1017,7 +1151,7 @@ new_segment:
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
- merge = 1;
+ merge = true;
} else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
@@ -1089,7 +1223,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
- if (skb->len < max || (flags & MSG_OOB))
+ if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
if (forced_push(tp)) {
@@ -1102,7 +1236,7 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied)
+ if (copied && likely(!tp->repair))
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
@@ -1113,10 +1247,10 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied && likely(!tp->repair))
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
- return copied;
+ return copied + copied_syn;
do_fault:
if (!skb->len) {
@@ -1129,7 +1263,7 @@ do_fault:
}
do_error:
- if (copied)
+ if (copied + copied_syn)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
@@ -1187,6 +1321,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
return -EAGAIN;
}
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sk_buff *skb;
+ int copied = 0, err = 0;
+
+ /* XXX -- need to support SO_PEEK_OFF */
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+ if (err)
+ break;
+
+ copied += skb->len;
+ }
+
+ return err ?: copied;
+}
+
/* Clean up the receive buffer for full frames taken by the user,
* then send an ACK if necessary. COPIED is the number of bytes
* tcp_recvmsg has given to the user so far, it speeds up the
@@ -1196,7 +1348,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
- int time_to_ack = 0;
+ bool time_to_ack = false;
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
@@ -1222,7 +1374,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
!icsk->icsk_ack.pingpong)) &&
!atomic_read(&sk->sk_rmem_alloc)))
- time_to_ack = 1;
+ time_to_ack = true;
}
/* We send an ACK if we can now advertise a non-zero window
@@ -1244,7 +1396,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
* "Lots" means "at least twice" here.
*/
if (new_window && new_window >= 2 * rcv_window_now)
- time_to_ack = 1;
+ time_to_ack = true;
}
}
if (time_to_ack)
@@ -1376,11 +1528,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
break;
}
if (tcp_hdr(skb)->fin) {
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
++seq;
break;
}
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
if (!desc->count)
break;
tp->copied_seq = seq;
@@ -1416,7 +1568,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- int copied_early = 0;
+ bool copied_early = false;
struct sk_buff *skb;
u32 urg_hole = 0;
@@ -1432,6 +1584,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto recv_urg;
+ if (unlikely(tp->repair)) {
+ err = -EPERM;
+ if (!(flags & MSG_PEEK))
+ goto out;
+
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ goto recv_sndq;
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out;
+
+ /* 'common' recv queue MSG_PEEK-ing */
+ }
+
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
peek_seq = tp->copied_seq;
@@ -1452,7 +1619,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if ((available < target) &&
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
- dma_find_channel(DMA_MEMCPY)) {
+ net_dma_find_channel()) {
preempt_enable_no_resched();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
@@ -1633,9 +1800,9 @@ do_prequeue:
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
- if (net_ratelimit())
- printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
- current->comm, task_pid_nr(current));
+ net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
peek_seq = tp->copied_seq;
}
continue;
@@ -1667,7 +1834,7 @@ do_prequeue:
if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan) {
tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1689,7 +1856,7 @@ do_prequeue:
dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
if ((offset + used) == skb->len)
- copied_early = 1;
+ copied_early = true;
} else
#endif
@@ -1723,7 +1890,7 @@ skip_copy:
goto found_fin_ok;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
+ copied_early = false;
}
continue;
@@ -1732,7 +1899,7 @@ skip_copy:
++*seq;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
+ copied_early = false;
}
break;
} while (len > 0);
@@ -1783,6 +1950,10 @@ out:
recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
goto out;
+
+recv_sndq:
+ err = tcp_peek_sndq(sk, msg, len);
+ goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);
@@ -1886,10 +2057,10 @@ bool tcp_check_oom(struct sock *sk, int shift)
too_many_orphans = tcp_too_many_orphans(sk, shift);
out_of_socket_memory = tcp_out_of_memory(sk);
- if (too_many_orphans && net_ratelimit())
- pr_info("too many orphaned sockets\n");
- if (out_of_socket_memory && net_ratelimit())
- pr_info("out of memory -- consider tuning tcp_mem\n");
+ if (too_many_orphans)
+ net_info_ratelimited("too many orphaned sockets\n");
+ if (out_of_socket_memory)
+ net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
return too_many_orphans || out_of_socket_memory;
}
@@ -1935,7 +2106,9 @@ void tcp_close(struct sock *sk, long timeout)
* advertise a zero window, then kill -9 the FTP client, wheee...
* Note: timeout is always zero in such a case.
*/
- if (data_was_unread) {
+ if (unlikely(tcp_sk(sk)->repair)) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
@@ -2053,7 +2226,7 @@ EXPORT_SYMBOL(tcp_close);
/* These states need RST on ABORT according to RFC793 */
-static inline int tcp_need_reset(int state)
+static inline bool tcp_need_reset(int state)
{
return (1 << state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2074,6 +2247,8 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
+ } else if (unlikely(tp->repair)) {
+ sk->sk_err = ECONNABORTED;
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2125,6 +2300,54 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+ return capable(CAP_NET_ADMIN) &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+}
+
+static int tcp_repair_options_est(struct tcp_sock *tp,
+ struct tcp_repair_opt __user *optbuf, unsigned int len)
+{
+ struct tcp_repair_opt opt;
+
+ while (len >= sizeof(opt)) {
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ optbuf++;
+ len -= sizeof(opt);
+
+ switch (opt.opt_code) {
+ case TCPOPT_MSS:
+ tp->rx_opt.mss_clamp = opt.opt_val;
+ break;
+ case TCPOPT_WINDOW:
+ if (opt.opt_val > 14)
+ return -EFBIG;
+
+ tp->rx_opt.snd_wscale = opt.opt_val;
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.tstamp_ok = 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
/*
* Socket option code for TCP.
*/
@@ -2295,6 +2518,55 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
else
tp->thin_dupack = val;
+ if (tp->thin_dupack)
+ tcp_disable_early_retrans(tp);
+ break;
+
+ case TCP_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ err = -EPERM;
+ else if (val == 1) {
+ tp->repair = 1;
+ sk->sk_reuse = SK_FORCE_REUSE;
+ tp->repair_queue = TCP_NO_QUEUE;
+ } else if (val == 0) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ tcp_send_window_probe(sk);
+ } else
+ err = -EINVAL;
+
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (!tp->repair)
+ err = -EPERM;
+ else if (val < TCP_QUEUES_NR)
+ tp->repair_queue = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (sk->sk_state != TCP_CLOSE)
+ err = -EPERM;
+ else if (tp->repair_queue == TCP_SEND_QUEUE)
+ tp->write_seq = val;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ tp->rcv_nxt = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_REPAIR_OPTIONS:
+ if (!tp->repair)
+ err = -EINVAL;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ err = tcp_repair_options_est(tp,
+ (struct tcp_repair_opt __user *)optval,
+ optlen);
+ else
+ err = -EPERM;
break;
case TCP_CORK:
@@ -2530,6 +2802,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
+ if (tp->repair)
+ val = tp->rx_opt.mss_clamp;
break;
case TCP_NODELAY:
val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2632,6 +2906,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->thin_dupack;
break;
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (tp->repair)
+ val = tp->repair_queue;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ val = tp->write_seq;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ val = tp->rcv_nxt;
+ else
+ return -EINVAL;
+ break;
+
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
@@ -2675,7 +2969,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct tcphdr *th;
- unsigned thlen;
+ unsigned int thlen;
unsigned int seq;
__be32 delta;
unsigned int oldlen;
@@ -2933,13 +3227,13 @@ out_free:
struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
{
struct tcp_md5sig_pool __percpu *pool;
- int alloc = 0;
+ bool alloc = false;
retry:
spin_lock_bh(&tcp_md5sig_pool_lock);
pool = tcp_md5sig_pool;
if (tcp_md5sig_users++ == 0) {
- alloc = 1;
+ alloc = true;
spin_unlock_bh(&tcp_md5sig_pool_lock);
} else if (!pool) {
tcp_md5sig_users--;
@@ -3033,9 +3327,9 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
struct scatterlist sg;
const struct tcphdr *tp = tcp_hdr(skb);
struct hash_desc *desc = &hp->md5_desc;
- unsigned i;
- const unsigned head_data_len = skb_headlen(skb) > header_len ?
- skb_headlen(skb) - header_len : 0;
+ unsigned int i;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
const struct skb_shared_info *shi = skb_shinfo(skb);
struct sk_buff *frag_iter;
@@ -3072,8 +3366,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
-/**
- * Each Responder maintains up to two secret values concurrently for
+/* Each Responder maintains up to two secret values concurrently for
* efficient secret rollover. Each secret value has 4 states:
*
* Generating. (tcp_secret_generating != tcp_secret_primary)
@@ -3223,9 +3516,15 @@ extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- thash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &thash_entries);
+ if (ret)
+ return 0;
+
return 1;
}
__setup("thash_entries=", set_thash_entries);
@@ -3243,7 +3542,7 @@ void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long limit;
- int max_share, cnt;
+ int max_rshare, max_wshare, cnt;
unsigned int i;
unsigned long jiffy = jiffies;
@@ -3270,6 +3569,7 @@ void __init tcp_init(void)
0,
NULL,
&tcp_hashinfo.ehash_mask,
+ 0,
thash_entries ? 0 : 512 * 1024);
for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
@@ -3286,6 +3586,7 @@ void __init tcp_init(void)
0,
&tcp_hashinfo.bhash_size,
NULL,
+ 0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
@@ -3302,21 +3603,23 @@ void __init tcp_init(void)
tcp_init_mem(&init_net);
/* Set per-socket limits to no more than 1/128 the pressure threshold */
- limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10);
- limit = max(limit, 128UL);
- max_share = min(4UL*1024*1024, limit);
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+ max_wshare = min(4UL*1024*1024, limit);
+ max_rshare = min(6UL*1024*1024, limit);
sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_share);
+ sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_share);
+ sysctl_tcp_rmem[2] = max(87380, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ tcp_metrics_init();
+
tcp_register_congestion_control(&tcp_reno);
memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
@@ -3327,4 +3630,5 @@ void __init tcp_init(void)
tcp_secret_primary = &tcp_secret_one;
tcp_secret_retiring = &tcp_secret_two;
tcp_secret_secondary = &tcp_secret_two;
+ tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 272a84593c85..4d4db16e336e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -280,19 +280,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
/* RFC2861 Check whether we are limited by application or congestion window
* This is the inverse of cwnd check in tcp_tso_should_defer
*/
-int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
const struct tcp_sock *tp = tcp_sk(sk);
u32 left;
if (in_flight >= tp->snd_cwnd)
- return 1;
+ return true;
left = tp->snd_cwnd - in_flight;
if (sk_can_gso(sk) &&
left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
left * tp->mss_cache < sk->sk_gso_max_size)
- return 1;
+ return true;
return left <= tcp_max_tso_deferred_mss(tp);
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
void tcp_slow_start(struct tcp_sock *tp)
{
int cnt; /* increase in packets */
+ unsigned int delta = 0;
/* RFC3465: ABC Slow start
* Increase only after a full MSS of bytes is acked
@@ -333,9 +334,9 @@ void tcp_slow_start(struct tcp_sock *tp)
tp->snd_cwnd_cnt += cnt;
while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
tp->snd_cwnd_cnt -= tp->snd_cwnd;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
+ delta++;
}
+ tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000000..a7f729c409d7
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+int sysctl_tcp_fastopen;
+
+static int __init tcp_fastopen_init(void)
+{
+ return 0;
+}
+
+late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index fe3ecf484b44..57bdd17dff4d 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -15,7 +15,7 @@
/* Tcp Hybla structure. */
struct hybla {
- u8 hybla_en;
+ bool hybla_en;
u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
u32 rho; /* Rho parameter, integer part */
u32 rho2; /* Rho * Rho, integer part */
@@ -24,8 +24,7 @@ struct hybla {
u32 minrtt; /* Minimum smoothed round trip time value seen */
};
-/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
- expressed in jiffies */
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
static int rtt0 = 25;
module_param(rtt0, int, 0644);
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
@@ -39,7 +38,7 @@ static inline void hybla_recalc_param (struct sock *sk)
ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
- ca->rho2 = ca->rho2_7ls >>7;
+ ca->rho2 = ca->rho2_7ls >> 7;
}
static void hybla_init(struct sock *sk)
@@ -52,7 +51,7 @@ static void hybla_init(struct sock *sk)
ca->rho_3ls = 0;
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
- ca->hybla_en = 1;
+ ca->hybla_en = true;
tp->snd_cwnd = 2;
tp->snd_cwnd_clamp = 65535;
@@ -67,6 +66,7 @@ static void hybla_init(struct sock *sk)
static void hybla_state(struct sock *sk, u8 ca_state)
{
struct hybla *ca = inet_csk_ca(sk);
+
ca->hybla_en = (ca_state == TCP_CA_Open);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e886e2f7fa8d..21d7f8f3a7a5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,20 +85,23 @@ int sysctl_tcp_ecn __read_mostly = 2;
EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 100;
+
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -175,7 +178,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
static void tcp_incr_quickack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
@@ -195,9 +198,10 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline int tcp_in_quickack_mode(const struct sock *sk)
+static inline bool tcp_in_quickack_mode(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
@@ -252,11 +256,11 @@ static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
- return 1;
- return 0;
+ return true;
+ return false;
}
/* Buffer size and advertised window tuning.
@@ -335,6 +339,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
incr = __tcp_grow_window(sk, skb);
if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
tp->window_clamp);
inet_csk(sk)->icsk_ack.quick |= 1;
@@ -474,8 +479,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
- } else if (m < new_sample)
- new_sample = m << 3;
+ } else {
+ m <<= 3;
+ if (m < new_sample)
+ new_sample = m;
+ }
} else {
/* No previous measure. */
new_sample = m << 3;
@@ -491,7 +499,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
@@ -695,7 +703,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -722,109 +730,6 @@ static inline void tcp_set_rto(struct sock *sk)
tcp_bound_rto(sk);
}
-/* Save metrics learned by this TCP session.
- This function is called only, when TCP finishes successfully
- i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
- */
-void tcp_update_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (sysctl_tcp_nometrics_save)
- return;
-
- dst_confirm(dst);
-
- if (dst && (dst->flags & DST_HOST)) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int m;
- unsigned long rtt;
-
- if (icsk->icsk_backoff || !tp->srtt) {
- /* This session failed to estimate rtt. Why?
- * Probably, no packets returned in time.
- * Reset our results.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst_metric_set(dst, RTAX_RTT, 0);
- return;
- }
-
- rtt = dst_metric_rtt(dst, RTAX_RTT);
- m = rtt - tp->srtt;
-
- /* If newly calculated rtt larger than stored one,
- * store new one. Otherwise, use EWMA. Remember,
- * rtt overestimation is always better than underestimation.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT))) {
- if (m <= 0)
- set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
- else
- set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
- }
-
- if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
- unsigned long var;
- if (m < 0)
- m = -m;
-
- /* Scale deviation to rttvar fixed point */
- m >>= 1;
- if (m < tp->mdev)
- m = tp->mdev;
-
- var = dst_metric_rtt(dst, RTAX_RTTVAR);
- if (m >= var)
- var = m;
- else
- var -= (var - m) >> 2;
-
- set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
- }
-
- if (tcp_in_initial_slowstart(tp)) {
- /* Slow start still did not finish. */
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
- if (!dst_metric_locked(dst, RTAX_CWND) &&
- tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
- icsk->icsk_ca_state == TCP_CA_Open) {
- /* Cong. avoidance phase, cwnd is reliable. */
- if (!dst_metric_locked(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH,
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND,
- (dst_metric(dst, RTAX_CWND) +
- tp->snd_cwnd) >> 1);
- } else {
- /* Else slow start did not finish, cwnd is non-sense,
- ssthresh may be also invalid.
- */
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND,
- (dst_metric(dst, RTAX_CWND) +
- tp->snd_ssthresh) >> 1);
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
- }
-
- if (!dst_metric_locked(dst, RTAX_REORDERING)) {
- if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
- dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
- }
- }
-}
-
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -861,7 +766,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
* Packet counting of FACK is based on in-order assumptions, therefore TCP
* disables it when reordering is detected
*/
-static void tcp_disable_fack(struct tcp_sock *tp)
+void tcp_disable_fack(struct tcp_sock *tp)
{
/* RFC3517 uses different metric in lost marker => reset on change */
if (tcp_is_fack(tp))
@@ -875,85 +780,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst == NULL)
- goto reset;
-
- dst_confirm(dst);
-
- if (dst_metric_locked(dst, RTAX_CWND))
- tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
- if (dst_metric(dst, RTAX_SSTHRESH)) {
- tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- } else {
- /* ssthresh may have been reduced unnecessarily during.
- * 3WHS. Restore it back to its initial default.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- }
- if (dst_metric(dst, RTAX_REORDERING) &&
- tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
- tcp_disable_fack(tp);
- tp->reordering = dst_metric(dst, RTAX_REORDERING);
- }
-
- if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
- goto reset;
-
- /* Initial rtt is determined from SYN,SYN-ACK.
- * The segment is small and rtt may appear much
- * less than real one. Use per-dst memory
- * to make it more realistic.
- *
- * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal circumstances sending small
- * packets force peer to delay ACKs and calculation is correct too.
- * The algorithm is adaptive and, provided we follow specs, it
- * NEVER underestimate RTT. BUT! If peer tries to make some clever
- * tricks sort of "quick acks" for time long enough to decrease RTT
- * to low value, and then abruptly stops to do it and starts to delay
- * ACKs, wait for troubles.
- */
- if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
- tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
- tp->rtt_seq = tp->snd_nxt;
- }
- if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
- tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
- }
- tcp_set_rto(sk);
-reset:
- if (tp->srtt == 0) {
- /* RFC2988bis: We've failed to get a valid RTT sample from
- * 3WHS. This is most likely due to retransmission,
- * including spurious one. Reset the RTO back to 3secs
- * from the more aggressive 1sec to avoid more spurious
- * retransmission.
- */
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
- }
- /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
- * retransmitted. In light of RFC2988bis' more aggressive 1sec
- * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
- * retransmission has occurred.
- */
- if (tp->total_retrans > 1)
- tp->snd_cwnd = 1;
- else
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
{
@@ -975,15 +801,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
- printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
- tp->reordering,
- tp->fackets_out,
- tp->sacked_out,
- tp->undo_marker ? tp->undo_retrans : 0);
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
#endif
tcp_disable_fack(tp);
}
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
}
/* This must be called before lost_out is incremented */
@@ -1114,36 +943,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
* the exact amount is rather hard to quantify. However, tp->max_window can
* be used as an exaggerated estimate.
*/
-static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
- u32 start_seq, u32 end_seq)
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+ u32 start_seq, u32 end_seq)
{
/* Too far in future, or reversed (interpretation is ambiguous) */
if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
- return 0;
+ return false;
/* Nasty start_seq wrap-around check (see comments above) */
if (!before(start_seq, tp->snd_nxt))
- return 0;
+ return false;
/* In outstanding window? ...This is valid exit for D-SACKs too.
* start_seq == snd_una is non-sensical (see comments above)
*/
if (after(start_seq, tp->snd_una))
- return 1;
+ return true;
if (!is_dsack || !tp->undo_marker)
- return 0;
+ return false;
/* ...Then it's D-SACK, and must reside below snd_una completely */
if (after(end_seq, tp->snd_una))
- return 0;
+ return false;
if (!before(start_seq, tp->undo_marker))
- return 1;
+ return true;
/* Too old */
if (!after(end_seq, tp->undo_marker))
- return 0;
+ return false;
/* Undo_marker boundary crossing (overestimates a lot). Known already:
* start_seq < undo_marker and end_seq >= undo_marker.
@@ -1215,17 +1044,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)
tp->lost_retrans_low = new_low_seq;
}
-static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
- struct tcp_sack_block_wire *sp, int num_sacks,
- u32 prior_snd_una)
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
- int dup_sack = 0;
+ bool dup_sack = false;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
- dup_sack = 1;
+ dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
@@ -1234,7 +1063,7 @@ static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
if (!after(end_seq_0, end_seq_1) &&
!before(start_seq_0, start_seq_1)) {
- dup_sack = 1;
+ dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
@@ -1265,9 +1094,10 @@ struct tcp_sacktag_state {
* FIXME: this could be merged to shift decision code
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
- u32 start_seq, u32 end_seq)
+ u32 start_seq, u32 end_seq)
{
- int in_sack, err;
+ int err;
+ bool in_sack;
unsigned int pkt_len;
unsigned int mss;
@@ -1313,7 +1143,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount)
+ bool dup_sack, int pcount)
{
struct tcp_sock *tp = tcp_sk(sk);
int fack_count = state->fack_count;
@@ -1393,10 +1223,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
-static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
- struct tcp_sacktag_state *state,
- unsigned int pcount, int shifted, int mss,
- int dup_sack)
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ unsigned int pcount, int shifted, int mss,
+ bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1446,7 +1276,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
- return 0;
+ return false;
}
/* Whole SKB was eaten :-) */
@@ -1469,7 +1299,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
- return 1;
+ return true;
}
/* I wish gso_size would have a bit more sane initialization than
@@ -1492,7 +1322,7 @@ static int skb_can_shift(const struct sk_buff *skb)
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
- int dup_sack)
+ bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev;
@@ -1631,14 +1461,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
- int dup_sack_in)
+ bool dup_sack_in)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *tmp;
tcp_for_write_queue_from(skb, sk) {
int in_sack = 0;
- int dup_sack = dup_sack_in;
+ bool dup_sack = dup_sack_in;
if (skb == tcp_send_head(sk))
break;
@@ -1653,7 +1483,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
next_dup->start_seq,
next_dup->end_seq);
if (in_sack > 0)
- dup_sack = 1;
+ dup_sack = true;
}
/* skb reference here is a bit tricky to get right, since
@@ -1758,7 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
- int found_dup_sack = 0;
+ bool found_dup_sack = false;
int i, j;
int first_sack_index;
@@ -1789,7 +1619,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
used_sacks = 0;
first_sack_index = 0;
for (i = 0; i < num_sacks; i++) {
- int dup_sack = !i && found_dup_sack;
+ bool dup_sack = !i && found_dup_sack;
sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
@@ -1856,7 +1686,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
while (i < used_sacks) {
u32 start_seq = sp[i].start_seq;
u32 end_seq = sp[i].end_seq;
- int dup_sack = (found_dup_sack && (i == first_sack_index));
+ bool dup_sack = (found_dup_sack && (i == first_sack_index));
struct tcp_sack_block *next_dup = NULL;
if (found_dup_sack && ((i + 1) == first_sack_index))
@@ -1958,9 +1788,9 @@ out:
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
- * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
+ * packets_out. Returns false if sacked_out adjustement wasn't necessary.
*/
-static int tcp_limit_reno_sacked(struct tcp_sock *tp)
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
u32 holes;
@@ -1969,9 +1799,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/* If we receive more dupacks than we expected counting segments
@@ -2025,40 +1855,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp)
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
-int tcp_use_frto(struct sock *sk)
+bool tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
if (!sysctl_tcp_frto)
- return 0;
+ return false;
/* MTU probe and F-RTO won't really play nicely along currently */
if (icsk->icsk_mtup.probe_size)
- return 0;
+ return false;
if (tcp_is_sackfrto(tp))
- return 1;
+ return true;
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
- return 0;
+ return false;
skb = tcp_write_queue_head(sk);
if (tcp_skb_is_last(sk, skb))
- return 1;
+ return true;
skb = tcp_write_queue_next(sk, skb); /* Skips head */
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- return 0;
+ return false;
/* Short-circuit when first non-SACKed skb has been checked */
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
break;
}
- return 1;
+ return true;
}
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
@@ -2294,7 +2124,7 @@ void tcp_enter_loss(struct sock *sk, int how)
*
* Do processing similar to RTO timeout.
*/
-static int tcp_check_sack_reneging(struct sock *sk, int flag)
+static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
if (flag & FLAG_SACK_RENEGING) {
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2305,9 +2135,9 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag)
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static inline int tcp_fackets_out(const struct tcp_sock *tp)
@@ -2335,6 +2165,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay;
+
+ /* Delay early retransmit and entering fast recovery for
+ * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+ * available, or RTO is scheduled to fire first.
+ */
+ if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+ return false;
+
+ delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+ if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+ return false;
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
+ tp->early_retrans_delayed = 1;
+ return true;
+}
+
static inline int tcp_skb_timedout(const struct sock *sk,
const struct sk_buff *skb)
{
@@ -2442,28 +2293,28 @@ static inline int tcp_head_timedout(const struct sock *sk)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static int tcp_time_to_recover(struct sock *sk)
+static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
/* Do not perform any recovery during F-RTO algorithm */
if (tp->frto_counter)
- return 0;
+ return false;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
- return 1;
+ return true;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_dupack_heuristics(tp) > tp->reordering)
- return 1;
+ return true;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_is_fack(tp) && tcp_head_timedout(sk))
- return 1;
+ return true;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
@@ -2475,7 +2326,7 @@ static int tcp_time_to_recover(struct sock *sk)
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
- return 1;
+ return true;
}
/* If a thin stream is detected, retransmit after first
@@ -2486,9 +2337,19 @@ static int tcp_time_to_recover(struct sock *sk)
if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
tcp_is_sack(tp) && !tcp_send_head(sk))
- return 1;
+ return true;
- return 0;
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return !tcp_pause_early_retransmit(sk, flag);
+
+ return false;
}
/* New heuristics: it is possible only after we switched to restart timer
@@ -2660,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag)
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
-static inline int tcp_packet_delayed(const struct tcp_sock *tp)
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2676,22 +2537,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &inet->inet_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &np->daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &np->daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#endif
}
@@ -2721,13 +2582,13 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static inline int tcp_may_undo(const struct tcp_sock *tp)
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
-static int tcp_try_undo_recovery(struct sock *sk)
+static bool tcp_try_undo_recovery(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2752,10 +2613,10 @@ static int tcp_try_undo_recovery(struct sock *sk)
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
- return 1;
+ return true;
}
tcp_set_ca_state(sk, TCP_CA_Open);
- return 0;
+ return false;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
@@ -2785,19 +2646,19 @@ static void tcp_try_undo_dsack(struct sock *sk)
* that successive retransmissions of a segment must not advance
* retrans_stamp under any conditions.
*/
-static int tcp_any_retrans_done(const struct sock *sk)
+static bool tcp_any_retrans_done(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (tp->retrans_out)
- return 1;
+ return true;
skb = tcp_write_queue_head(sk);
if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
- return 1;
+ return true;
- return 0;
+ return false;
}
/* Undo during fast recovery after partial ACK. */
@@ -2831,7 +2692,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
}
/* Undo during loss recovery after partial ACK. */
-static int tcp_try_undo_loss(struct sock *sk)
+static bool tcp_try_undo_loss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2853,9 +2714,9 @@ static int tcp_try_undo_loss(struct sock *sk)
tp->undo_marker = 0;
if (tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static inline void tcp_complete_cwr(struct sock *sk)
@@ -2864,11 +2725,14 @@ static inline void tcp_complete_cwr(struct sock *sk)
/* Do not moderate cwnd if it's already undone in cwr or recovery. */
if (tp->undo_marker) {
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- else /* PRR */
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+ /* PRR algorithm. */
tp->snd_cwnd = tp->snd_ssthresh;
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
@@ -3018,6 +2882,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
+
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->high_seq = tp->snd_nxt;
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = tp->snd_una;
+ tp->undo_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ TCP_ECN_queue_cwr(tp);
+ }
+
+ tp->bytes_acked = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -3037,7 +2933,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
struct tcp_sock *tp = tcp_sk(sk);
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0, mib_idx;
+ int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -3121,7 +3017,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- if (!tcp_time_to_recover(sk)) {
+ if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
}
@@ -3138,32 +3034,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
}
/* Otherwise enter Recovery state */
-
- if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENORECOVERY;
- else
- mib_idx = LINUX_MIB_TCPSACKRECOVERY;
-
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
- tp->high_seq = tp->snd_nxt;
- tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
-
- if (icsk->icsk_ca_state < TCP_CA_CWR) {
- if (!(flag & FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
- }
-
- tp->bytes_acked = 0;
- tp->snd_cwnd_cnt = 0;
- tp->prior_cwnd = tp->snd_cwnd;
- tp->prr_delivered = 0;
- tp->prr_out = 0;
- tcp_set_ca_state(sk, TCP_CA_Recovery);
+ tcp_enter_recovery(sk, (flag & FLAG_ECE));
fast_rexmit = 1;
}
@@ -3245,16 +3116,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
-static void tcp_rearm_rto(struct sock *sk)
+void tcp_rearm_rto(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (tp->early_retrans_delayed) {
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+ const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+ s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+ /* delta may not be positive if the socket is locked
+ * when the delayed ER timer fires and is rescheduled.
+ */
+ if (delta > 0)
+ rto = delta;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX);
}
+ tp->early_retrans_delayed = 0;
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_rearm_rto(sk);
+
+ /* Stop if ER is disabled after the delayed ER timer is scheduled */
+ if (!tp->do_early_retrans)
+ return;
+
+ tcp_enter_recovery(sk, false);
+ tcp_update_scoreboard(sk, 1);
+ tcp_xmit_retransmit_queue(sk);
}
/* If we get here, the whole TSO packet has not been acked. */
@@ -3289,7 +3191,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
u32 now = tcp_time_stamp;
- int fully_acked = 1;
+ int fully_acked = true;
int flag = 0;
u32 pkts_acked = 0;
u32 reord = tp->packets_out;
@@ -3313,7 +3215,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!acked_pcount)
break;
- fully_acked = 0;
+ fully_acked = false;
} else {
acked_pcount = tcp_skb_pcount(skb);
}
@@ -3430,18 +3332,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!tp->packets_out && tcp_is_sack(tp)) {
icsk = inet_csk(sk);
if (tp->lost_out) {
- printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, icsk->icsk_ca_state);
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
- printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, icsk->icsk_ca_state);
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
- printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, icsk->icsk_ca_state);
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -3469,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk)
}
}
-static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
-static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
@@ -3485,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(const struct tcp_sock *tp,
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
@@ -3592,7 +3494,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
* to prove that the RTO is indeed spurious. It transfers the control
* from F-RTO to the conventional RTO recovery
*/
-static int tcp_process_frto(struct sock *sk, int flag)
+static bool tcp_process_frto(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3608,7 +3510,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
if (!before(tp->snd_una, tp->frto_highmark)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
- return 1;
+ return true;
}
if (!tcp_is_sackfrto(tp)) {
@@ -3617,19 +3519,19 @@ static int tcp_process_frto(struct sock *sk, int flag)
* data, winupdate
*/
if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
- return 1;
+ return true;
if (!(flag & FLAG_DATA_ACKED)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
flag);
- return 1;
+ return true;
}
} else {
if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
- return 1;
+ return true;
}
if ((tp->frto_counter >= 2) &&
@@ -3639,10 +3541,10 @@ static int tcp_process_frto(struct sock *sk, int flag)
/* RFC4138 shortcoming (see comment above) */
if (!(flag & FLAG_FORWARD_PROGRESS) &&
(flag & FLAG_NOT_DUP))
- return 1;
+ return true;
tcp_enter_frto_loss(sk, 3, flag);
- return 1;
+ return true;
}
}
@@ -3654,7 +3556,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
if (!tcp_may_send_now(sk))
tcp_enter_frto_loss(sk, 2, flag);
- return 1;
+ return true;
} else {
switch (sysctl_tcp_frto_response) {
case 2:
@@ -3671,7 +3573,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
tp->undo_marker = 0;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
}
- return 0;
+ return false;
}
/* This routine deals with incoming acks, but not outgoing ones. */
@@ -3689,7 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_sacked = tp->sacked_out;
int pkts_acked = 0;
int newly_acked_sacked = 0;
- int frto_cwnd = 0;
+ bool frto_cwnd = false;
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3703,6 +3605,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ if (tp->early_retrans_delayed)
+ tcp_rearm_rto(sk);
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3783,9 +3688,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_cong_avoid(sk, ack, prior_in_flight);
}
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
- dst_confirm(__sk_dst_get(sk));
-
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
+ struct dst_entry *dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+ }
return 1;
no_queue:
@@ -3825,7 +3732,8 @@ old_ack:
* the fast version below fails.
*/
void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
- const u8 **hvpp, int estab)
+ const u8 **hvpp, int estab,
+ struct tcp_fastopen_cookie *foc)
{
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
@@ -3868,10 +3776,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
- if (net_ratelimit())
- pr_info("%s: Illegal window scaling value %d >14 received\n",
- __func__,
- snd_wscale);
+ net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ __func__,
+ snd_wscale);
snd_wscale = 14;
}
opt_rx->snd_wscale = snd_wscale;
@@ -3933,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
break;
}
break;
- }
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number. It's valid only in
+ * SYN or SYN-ACK with an even size.
+ */
+ if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
+ get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
+ foc == NULL || !th->syn || (opsize & 1))
+ break;
+ foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
+ if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
+ foc->len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, ptr + 2, foc->len);
+ else if (foc->len != 0)
+ foc->len = -1;
+ break;
+
+ }
ptr += opsize-2;
length -= opsize;
}
@@ -3942,7 +3866,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
}
EXPORT_SYMBOL(tcp_parse_options);
-static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
const __be32 *ptr = (const __be32 *)(th + 1);
@@ -3953,31 +3877,31 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
tp->rx_opt.rcv_tsecr = ntohl(*ptr);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static int tcp_fast_parse_options(const struct sk_buff *skb,
- const struct tcphdr *th,
- struct tcp_sock *tp, const u8 **hvpp)
+static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ const struct tcphdr *th,
+ struct tcp_sock *tp, const u8 **hvpp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
- return 0;
+ return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
- return 1;
+ return true;
}
- tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
- return 1;
+ tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
+ return true;
}
#ifdef CONFIG_TCP_MD5SIG
@@ -4082,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
-static inline int tcp_paws_discard(const struct sock *sk,
+static inline bool tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -4104,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk,
* (borrowed from freebsd)
*/
-static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
+static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
return !before(end_seq, tp->rcv_wup) &&
!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
@@ -4218,7 +4142,7 @@ static void tcp_fin(struct sock *sk)
}
}
-static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
@@ -4226,9 +4150,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
@@ -4424,10 +4348,10 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
-static int tcp_prune_ofo_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, size)) {
@@ -4446,6 +4370,46 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
return 0;
}
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ if (tcp_hdr(from)->fin)
+ return false;
+
+ /* Its possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ return true;
+}
+
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4454,8 +4418,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
TCP_ECN_check_ce(tp, skb);
- if (tcp_try_rmem_schedule(sk, skb->truesize)) {
- /* TODO: should increment a counter */
+ if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
__kfree_skb(skb);
return;
}
@@ -4464,6 +4428,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4484,23 +4449,13 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
end_seq = TCP_SKB_CB(skb)->end_seq;
if (seq == TCP_SKB_CB(skb1)->end_seq) {
- /* Packets in ofo can stay in queue a long time.
- * Better try to coalesce them right now
- * to avoid future tcp_collapse_ofo_queue(),
- * probably the most expensive function in tcp stack.
- */
- if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPRCVCOALESCE);
- BUG_ON(skb_copy_bits(skb, 0,
- skb_put(skb1, skb->len),
- skb->len));
- TCP_SKB_CB(skb1)->end_seq = end_seq;
- TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
- __kfree_skb(skb);
- skb = NULL;
- } else {
+ bool fragstolen;
+
+ if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ } else {
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
}
if (!tp->rx_opt.num_sacks ||
@@ -4527,6 +4482,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
__kfree_skb(skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
@@ -4565,6 +4521,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
__kfree_skb(skb1);
}
@@ -4576,12 +4533,65 @@ end:
skb_set_owner_r(skb, sk);
}
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ __skb_pull(skb, hdrlen);
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (!eaten) {
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb;
+ struct tcphdr *th;
+ bool fragstolen;
+
+ if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
+ goto err;
+
+ skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+ skb_reset_transport_header(skb);
+ memset(th, 0, sizeof(*th));
+
+ if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return -ENOMEM;
+}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
+ bool fragstolen = false;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
@@ -4626,8 +4636,7 @@ queue_and_out:
tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
@@ -4651,7 +4660,7 @@ queue_and_out:
tcp_fast_path_check(sk);
if (eaten > 0)
- __kfree_skb(skb);
+ kfree_skb_partial(skb, fragstolen);
else if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return;
@@ -4871,10 +4880,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
* Purge the out-of-order queue.
* Return true if queue was pruned.
*/
-static int tcp_prune_ofo_queue(struct sock *sk)
+static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- int res = 0;
+ bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
@@ -4888,7 +4897,7 @@ static int tcp_prune_ofo_queue(struct sock *sk)
if (tp->rx_opt.sack_ok)
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
- res = 1;
+ res = true;
}
return res;
}
@@ -4965,7 +4974,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static int tcp_should_expand_sndbuf(const struct sock *sk)
+static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -4973,21 +4982,21 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
* not modify it.
*/
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
- return 0;
+ return false;
/* If we are under global TCP memory pressure, do not expand. */
if (sk_under_memory_pressure(sk))
- return 0;
+ return false;
/* If we are under soft global TCP memory pressure, do not expand. */
if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
- return 0;
+ return false;
/* If we filled the congestion window, do not expand. */
if (tp->packets_out >= tp->snd_cwnd)
- return 0;
+ return false;
- return 1;
+ return true;
}
/* When incoming ACK allowed to free some skb from write_queue,
@@ -5205,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
return result;
}
-static inline int tcp_checksum_complete_user(struct sock *sk,
+static inline bool tcp_checksum_complete_user(struct sock *sk,
struct sk_buff *skb)
{
return !skb_csum_unnecessary(skb) &&
@@ -5213,19 +5222,19 @@ static inline int tcp_checksum_complete_user(struct sock *sk,
}
#ifdef CONFIG_NET_DMA
-static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
int chunk = skb->len - hlen;
int dma_cookie;
- int copied_early = 0;
+ bool copied_early = false;
if (tp->ucopy.wakeup)
- return 0;
+ return false;
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
@@ -5238,7 +5247,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
goto out;
tp->ucopy.dma_cookie = dma_cookie;
- copied_early = 1;
+ copied_early = true;
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
@@ -5259,11 +5268,28 @@ out:
}
#endif /* CONFIG_NET_DMA */
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+ /* unprotected vars, we dont care of overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ u32 now = jiffies / HZ;
+
+ if (now != challenge_timestamp) {
+ challenge_timestamp = now;
+ challenge_count = 0;
+ }
+ if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
+}
+
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
-static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, int syn_inerr)
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, int syn_inerr)
{
const u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
@@ -5288,14 +5314,26 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)".
*/
- if (!th->rst)
+ if (!th->rst) {
+ if (th->syn)
+ goto syn_challenge;
tcp_send_dupack(sk, skb);
+ }
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
- tcp_reset(sk);
+ /* RFC 5961 3.2 :
+ * If sequence number exactly matches RCV.NXT, then
+ * RESET the connection
+ * else
+ * Send a challenge ACK
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+ tcp_reset(sk);
+ else
+ tcp_send_challenge_ack(sk);
goto discard;
}
@@ -5306,20 +5344,23 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
/* step 3: check security and precedence [ignored] */
- /* step 4: Check for a SYN in window. */
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* step 4: Check for a SYN
+ * RFC 5691 4.2 : Send a challenge ack
+ */
+ if (th->syn) {
+syn_challenge:
if (syn_inerr)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
- tcp_reset(sk);
- return -1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ tcp_send_challenge_ack(sk);
+ goto discard;
}
- return 1;
+ return true;
discard:
__kfree_skb(skb);
- return 0;
+ return false;
}
/*
@@ -5349,7 +5390,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
- int res;
+
+ if (sk->sk_rx_dst) {
+ struct dst_entry *dst = sk->sk_rx_dst;
+ if (unlikely(dst->obsolete)) {
+ if (dst->ops->check(dst, 0) == NULL) {
+ dst_release(dst);
+ sk->sk_rx_dst = NULL;
+ }
+ }
+ }
+ if (unlikely(sk->sk_rx_dst == NULL))
+ sk->sk_rx_dst = dst_clone(skb_dst(skb));
/*
* Header prediction.
@@ -5430,6 +5482,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
} else {
int eaten = 0;
int copied_early = 0;
+ bool fragstolen = false;
if (tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len) {
@@ -5487,10 +5540,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- __skb_pull(skb, tcp_header_len);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
}
tcp_event_data_recv(sk, skb);
@@ -5512,7 +5563,7 @@ no_ack:
else
#endif
if (eaten)
- __kfree_skb(skb);
+ kfree_skb_partial(skb, fragstolen);
else
sk->sk_data_ready(sk, 0);
return 0;
@@ -5527,9 +5578,8 @@ slow_path:
* Standard slow path.
*/
- res = tcp_validate_incoming(sk, skb, th, 1);
- if (res <= 0)
- return -res;
+ if (!tcp_validate_incoming(sk, skb, th, 1))
+ return 0;
step5:
if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
@@ -5556,6 +5606,85 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb != NULL) {
+ sk->sk_rx_dst = dst_clone(skb_dst(skb));
+ security_inet_conn_established(sk, skb);
+ }
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+ u16 mss = tp->rx_opt.mss_clamp;
+ bool syn_drop;
+
+ if (mss == tp->rx_opt.user_mss) {
+ struct tcp_options_received opt;
+ const u8 *hash_location;
+
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+ tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+ if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
+ cookie->len = -1;
+
+ /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
+ * the remote receives only the retransmitted (regular) SYNs: either
+ * the original SYN-data or the corresponding SYN-ACK is lost.
+ */
+ syn_drop = (cookie->len <= 0 && data &&
+ inet_csk(sk)->icsk_retransmits);
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+
+ if (data) { /* Retransmit unacked data in SYN */
+ tcp_retransmit_skb(sk, data);
+ tcp_rearm_rto(sk);
+ return true;
+ }
+ return false;
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
@@ -5563,9 +5692,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_cookie_values *cvp = tp->cookie_values;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
if (th->ack) {
/* rfc793:
@@ -5575,11 +5705,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
* a reset (unless the RST bit is set, if so drop
* the segment and return)"
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
*/
- if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
goto reset_and_undo;
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5688,36 +5816,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
}
smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
-
- security_inet_conn_established(sk, skb);
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
-
- tcp_init_congestion_control(sk);
-
- /* Prevent spurious tcp_cwnd_restart() on first data
- * packet.
- */
- tp->lsndtime = tcp_time_stamp;
-
- tcp_init_buffer_space(sk);
- if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ tcp_finish_connect(sk, skb);
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
+ if ((tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5731,8 +5835,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
inet_csk_schedule_ack(sk);
icsk->icsk_ack.lrcvtime = tcp_time_stamp;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(sk);
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5839,7 +5941,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
- int res;
tp->rx_opt.saw_tstamp = 0;
@@ -5894,9 +5995,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 0;
}
- res = tcp_validate_incoming(sk, skb, th, 0);
- if (res <= 0)
- return -res;
+ if (!tcp_validate_incoming(sk, skb, th, 0))
+ return 0;
/* step 5: check the ACK field */
if (th->ack) {
@@ -5952,9 +6052,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
case TCP_FIN_WAIT1:
if (tp->snd_una == tp->write_seq) {
+ struct dst_entry *dst;
+
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
- dst_confirm(__sk_dst_get(sk));
+
+ dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a25cf743f8b..1d8b75a58981 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -138,6 +138,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+static int tcp_repair_connect(struct sock *sk)
+{
+ tcp_connect_init(sk);
+ tcp_finish_connect(sk, NULL);
+
+ return 0;
+}
+
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -196,26 +204,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ if (likely(!tp->repair))
+ tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
- !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
- struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
- /*
- * VJ's idea. We save last timestamp seen from
- * the destination in peer table, when entering state
- * TIME-WAIT * and initialize rx_opt.ts_recent from it,
- * when trying new connection.
- */
- if (peer) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
- }
- }
- }
+ !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+ tcp_fetch_timewait_stamp(sk, &rt->dst);
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
@@ -247,7 +242,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
- if (!tp->write_seq)
+ if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
@@ -255,7 +250,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
- err = tcp_connect(sk);
+ if (likely(!tp->repair))
+ err = tcp_connect(sk);
+ else
+ err = tcp_repair_connect(sk);
+
rt = NULL;
if (err)
goto failure;
@@ -290,17 +289,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
if (sk->sk_state == TCP_LISTEN)
return;
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
return;
- dst->ops->update_pmtu(dst, mtu);
-
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
@@ -322,6 +314,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
} /* else let the usual retransmit timer handle it */
}
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -395,6 +395,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
switch (type) {
+ case ICMP_REDIRECT:
+ do_redirect(icmp_skb, sk);
+ goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
@@ -685,8 +688,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos;
- ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
- &arg, arg.iov[0].iov_len);
+ ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -768,8 +771,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
- ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
- &arg, arg.iov[0].iov_len);
+ ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
@@ -811,7 +814,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp)
+ struct request_values *rvp,
+ u16 queue_mapping,
+ bool nocache)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -819,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct sk_buff * skb;
/* First, grab a route. */
- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
return -1;
skb = tcp_make_synack(sk, dst, req, rvp);
@@ -827,13 +832,13 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
+ skb_set_queue_mapping(skb, queue_mapping);
err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
ireq->rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
}
- dst_release(dst);
return err;
}
@@ -841,7 +846,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
struct request_values *rvp)
{
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- return tcp_v4_send_synack(sk, NULL, req, rvp);
+ return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
}
/*
@@ -853,14 +858,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
}
/*
- * Return 1 if a syncookie should be sent
+ * Return true if a syncookie should be sent
*/
-int tcp_syn_flood_action(struct sock *sk,
+bool tcp_syn_flood_action(struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
const char *msg = "Dropping request";
- int want_cookie = 0;
+ bool want_cookie = false;
struct listen_sock *lopt;
@@ -868,7 +873,7 @@ int tcp_syn_flood_action(struct sock *sk,
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
msg = "Sending cookies";
- want_cookie = 1;
+ want_cookie = true;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
@@ -1183,7 +1188,7 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
-static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
/*
* This gets called for each TCP segment that arrives
@@ -1206,16 +1211,16 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
/* We've parsed the options - do we have a hash? */
if (!hash_expected && !hash_location)
- return 0;
+ return false;
if (hash_expected && !hash_location) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return 1;
+ return true;
}
if (!hash_expected && hash_location) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return 1;
+ return true;
}
/* Okay, so this is hash_expected and hash_location -
@@ -1226,15 +1231,14 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
NULL, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- if (net_ratelimit()) {
- pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
- &iph->saddr, ntohs(th->source),
- &iph->daddr, ntohs(th->dest),
- genhash ? " tcp_v4_calc_md5_hash failed" : "");
- }
- return 1;
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "");
+ return true;
}
- return 0;
+ return false;
}
#endif
@@ -1268,7 +1272,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- int want_cookie = 0;
+ bool want_cookie = false;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1303,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.cookie_plus > 0 &&
tmp_opt.saw_tstamp &&
@@ -1327,7 +1331,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
while (l-- > 0)
*c++ ^= *hash_location++;
- want_cookie = 0; /* not our kind of cookie */
+ want_cookie = false; /* not our kind of cookie */
tmp_ext.cookie_out_never = 0; /* false */
tmp_ext.cookie_plus = tmp_opt.cookie_plus;
} else if (!tp->rx_opt.cookie_in_always) {
@@ -1355,13 +1359,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, tcp_hdr(skb));
+ TCP_ECN_create_request(req, skb);
if (want_cookie) {
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
- struct inet_peer *peer = NULL;
struct flowi4 fl4;
/* VJ's idea. We save last timestamp seen
@@ -1375,13 +1378,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
- fl4.daddr == saddr &&
- (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
- (s32)(peer->tcp_ts - req->ts_recent) >
- TCP_PAWS_WINDOW) {
+ (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
+ fl4.daddr == saddr) {
+ if (!tcp_peer_is_proven(req, dst, true)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
@@ -1390,8 +1389,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
- (!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
+ !tcp_peer_is_proven(req, dst, false)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
@@ -1410,7 +1408,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_rsk(req)->snt_synack = tcp_time_stamp;
if (tcp_v4_send_synack(sk, dst, req,
- (struct request_values *)&tmp_ext) ||
+ (struct request_values *)&tmp_ext,
+ skb_get_queue_mapping(skb),
+ want_cookie) ||
want_cookie)
goto drop_and_free;
@@ -1657,6 +1657,51 @@ csum_err:
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
+void tcp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct net_device *dev;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return;
+
+ if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return;
+
+ if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+ return;
+
+ dev = skb->dev;
+ sk = __inet_lookup_established(net, &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ dev->ifindex);
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ struct dst_entry *dst = sk->sk_rx_dst;
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst) {
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt->rt_iif == dev->ifindex)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
+ }
+}
+
/*
* From tcp_input.c
*/
@@ -1730,7 +1775,7 @@ process:
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
@@ -1739,7 +1784,8 @@ process:
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
}
- } else if (unlikely(sk_add_backlog(sk, skb))) {
+ } else if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
@@ -1805,40 +1851,10 @@ do_time_wait:
goto discard_it;
}
-struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
-{
- struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_peer *peer;
-
- if (!rt ||
- inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
- peer = inet_getpeer_v4(inet->inet_daddr, 1);
- *release_it = true;
- } else {
- if (!rt->peer)
- rt_bind_peer(rt, inet->inet_daddr, 1);
- peer = rt->peer;
- *release_it = false;
- }
-
- return peer;
-}
-EXPORT_SYMBOL(tcp_v4_get_peer);
-
-void *tcp_v4_tw_get_peer(struct sock *sk)
-{
- const struct inet_timewait_sock *tw = inet_twsk(sk);
-
- return inet_getpeer_v4(tw->tw_daddr, 1);
-}
-EXPORT_SYMBOL(tcp_v4_tw_get_peer);
-
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
.twsk_unique = tcp_twsk_unique,
.twsk_destructor= tcp_twsk_destructor,
- .twsk_getpeer = tcp_v4_tw_get_peer,
};
const struct inet_connection_sock_af_ops ipv4_specific = {
@@ -1847,7 +1863,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
- .get_peer = tcp_v4_get_peer,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
@@ -1875,64 +1890,15 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- skb_queue_head_init(&tp->out_of_order_queue);
- tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
-
- icsk->icsk_rto = TCP_TIMEOUT_INIT;
- tp->mdev = TCP_TIMEOUT_INIT;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- tp->snd_cwnd = TCP_INIT_CWND;
-
- /* See draft-stevens-tcpca-spec-01 for discussion of the
- * initialization of these values.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = TCP_MSS_DEFAULT;
- tp->reordering = sysctl_tcp_reordering;
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
-
- sk->sk_state = TCP_CLOSE;
-
- sk->sk_write_space = sk_stream_write_space;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv4_specific;
- icsk->icsk_sync_mss = tcp_sync_mss;
+
#ifdef CONFIG_TCP_MD5SIG
- tp->af_specific = &tcp_sock_ipv4_specific;
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
- /* TCP Cookie Transactions */
- if (sysctl_tcp_cookie_size > 0) {
- /* Default, cookies without s_data_payload. */
- tp->cookie_values =
- kzalloc(sizeof(*tp->cookie_values),
- sk->sk_allocation);
- if (tp->cookie_values != NULL)
- kref_init(&tp->cookie_values->kref);
- }
- /* Presumed zeroed, in order of appearance:
- * cookie_in_always, cookie_out_never,
- * s_data_constant, s_data_in, s_data_out
- */
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
-
- local_bh_disable();
- sock_update_memcg(sk);
- sk_sockets_allocated_inc(sk);
- local_bh_enable();
-
return 0;
}
@@ -1986,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
tp->cookie_values = NULL;
}
+ /* If socket is aborted during connect operation */
+ tcp_free_fastopen_req(tp);
+
sk_sockets_allocated_dec(sk);
sock_release_memcg(sk);
}
@@ -2109,7 +2078,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline int empty_bucket(struct tcp_iter_state *st)
+static inline bool empty_bucket(struct tcp_iter_state *st)
{
return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
@@ -2626,6 +2595,7 @@ struct proto tcp_prot = {
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
+ .release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
@@ -2657,13 +2627,11 @@ EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
{
- return inet_ctl_sock_create(&net->ipv4.tcp_sock,
- PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+ return 0;
}
static void __net_exit tcp_sk_exit(struct net *net)
{
- inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index e795272fbe9e..b6f3583ddfe8 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,37 +6,6 @@
#include <linux/memcontrol.h>
#include <linux/module.h>
-static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft);
-static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
- const char *buffer);
-static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event);
-
-static struct cftype tcp_files[] = {
- {
- .name = "kmem.tcp.limit_in_bytes",
- .write_string = tcp_cgroup_write,
- .read_u64 = tcp_cgroup_read,
- .private = RES_LIMIT,
- },
- {
- .name = "kmem.tcp.usage_in_bytes",
- .read_u64 = tcp_cgroup_read,
- .private = RES_USAGE,
- },
- {
- .name = "kmem.tcp.failcnt",
- .private = RES_FAILCNT,
- .trigger = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- {
- .name = "kmem.tcp.max_usage_in_bytes",
- .private = RES_MAX_USAGE,
- .trigger = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
-};
-
static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
{
return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -49,7 +18,7 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
}
EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
-int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
/*
* The root cgroup does not use res_counters, but rather,
@@ -59,13 +28,12 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
struct res_counter *res_parent = NULL;
struct cg_proto *cg_proto, *parent_cg;
struct tcp_memcontrol *tcp;
- struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct net *net = current->nsproxy->net_ns;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
- goto create_files;
+ return 0;
tcp = tcp_from_cgproto(cg_proto);
@@ -88,15 +56,12 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
cg_proto->memcg = memcg;
-create_files:
- return cgroup_add_files(cgrp, ss, tcp_files,
- ARRAY_SIZE(tcp_files));
+ return 0;
}
EXPORT_SYMBOL(tcp_init_cgroup);
-void tcp_destroy_cgroup(struct cgroup *cgrp)
+void tcp_destroy_cgroup(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct cg_proto *cg_proto;
struct tcp_memcontrol *tcp;
u64 val;
@@ -109,9 +74,6 @@ void tcp_destroy_cgroup(struct cgroup *cgrp)
percpu_counter_destroy(&tcp->tcp_sockets_allocated);
val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
- if (val != RESOURCE_MAX)
- static_key_slow_dec(&memcg_socket_limit_enabled);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
@@ -142,10 +104,33 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
net->ipv4.sysctl_tcp_mem[i]);
- if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
- static_key_slow_dec(&memcg_socket_limit_enabled);
- else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
- static_key_slow_inc(&memcg_socket_limit_enabled);
+ if (val == RESOURCE_MAX)
+ clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ else if (val != RESOURCE_MAX) {
+ /*
+ * The active bit needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ *
+ * The activated bit is used to guarantee that no two writers
+ * will do the update in the same memcg. Without that, we can't
+ * properly shutdown the static key.
+ */
+ if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ }
return 0;
}
@@ -270,3 +255,37 @@ void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
tcp->tcp_prot_mem[idx] = val;
}
+
+static struct cftype tcp_files[] = {
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .write_string = tcp_cgroup_write,
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_LIMIT,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_USAGE,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = RES_FAILCNT,
+ .trigger = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .trigger = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ { } /* terminate */
+};
+
+static int __init tcp_memcontrol_init(void)
+{
+ WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+ return 0;
+}
+__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..992f1bff4fc6
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,744 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+#include <linux/hash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+enum tcp_metric_index {
+ TCP_METRIC_RTT,
+ TCP_METRIC_RTTVAR,
+ TCP_METRIC_SSTHRESH,
+ TCP_METRIC_CWND,
+ TCP_METRIC_REORDERING,
+
+ /* Always last. */
+ TCP_METRIC_MAX,
+};
+
+struct tcp_fastopen_metrics {
+ u16 mss;
+ u16 syn_loss:10; /* Recurring Fast Open SYN losses */
+ unsigned long last_syn_loss; /* Last Fast Open SYN loss */
+ struct tcp_fastopen_cookie cookie;
+};
+
+struct tcp_metrics_block {
+ struct tcp_metrics_block __rcu *tcpm_next;
+ struct inetpeer_addr tcpm_addr;
+ unsigned long tcpm_stamp;
+ u32 tcpm_ts;
+ u32 tcpm_ts_stamp;
+ u32 tcpm_lock;
+ u32 tcpm_vals[TCP_METRIC_MAX];
+ struct tcp_fastopen_metrics tcpm_fastopen;
+};
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_vals[idx];
+}
+
+static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return msecs_to_jiffies(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ tm->tcpm_vals[idx] = val;
+}
+
+static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ tm->tcpm_vals[idx] = jiffies_to_msecs(val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ const struct in6_addr *a6, *b6;
+
+ if (a->family != b->family)
+ return false;
+ if (a->family == AF_INET)
+ return a->addr.a4 == b->addr.a4;
+
+ a6 = (const struct in6_addr *) &a->addr.a6[0];
+ b6 = (const struct in6_addr *) &b->addr.a6[0];
+
+ return ipv6_addr_equal(a6, b6);
+}
+
+struct tcpm_hash_bucket {
+ struct tcp_metrics_block __rcu *chain;
+};
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ u32 val;
+
+ val = 0;
+ if (dst_metric_locked(dst, RTAX_RTT))
+ val |= 1 << TCP_METRIC_RTT;
+ if (dst_metric_locked(dst, RTAX_RTTVAR))
+ val |= 1 << TCP_METRIC_RTTVAR;
+ if (dst_metric_locked(dst, RTAX_SSTHRESH))
+ val |= 1 << TCP_METRIC_SSTHRESH;
+ if (dst_metric_locked(dst, RTAX_CWND))
+ val |= 1 << TCP_METRIC_CWND;
+ if (dst_metric_locked(dst, RTAX_REORDERING))
+ val |= 1 << TCP_METRIC_REORDERING;
+ tm->tcpm_lock = val;
+
+ tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
+ tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+ tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+ tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+ tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tm->tcpm_ts = 0;
+ tm->tcpm_ts_stamp = 0;
+ tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.syn_loss = 0;
+ tm->tcpm_fastopen.cookie.len = 0;
+}
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+ struct inetpeer_addr *addr,
+ unsigned int hash,
+ bool reclaim)
+{
+ struct tcp_metrics_block *tm;
+ struct net *net;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ net = dev_net(dst->dev);
+ if (unlikely(reclaim)) {
+ struct tcp_metrics_block *oldest;
+
+ oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+ for (tm = rcu_dereference(oldest->tcpm_next); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ oldest = tm;
+ }
+ tm = oldest;
+ } else {
+ tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ if (!tm)
+ goto out_unlock;
+ }
+ tm->tcpm_addr = *addr;
+ tm->tcpm_stamp = jiffies;
+
+ tcpm_suck_dst(tm, dst);
+
+ if (likely(!reclaim)) {
+ tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+ }
+
+out_unlock:
+ spin_unlock_bh(&tcp_metrics_lock);
+ return tm;
+}
+
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ tcpm_suck_dst(tm, dst);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+ if (tm)
+ return tm;
+ if (depth > TCP_METRICS_RECLAIM_DEPTH)
+ return TCP_METRICS_RECLAIM_PTR;
+ return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
+ struct net *net, unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ int depth = 0;
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_addr, addr))
+ break;
+ depth++;
+ }
+ return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr addr;
+ unsigned int hash;
+ struct net *net;
+
+ addr.family = req->rsk_ops->family;
+ switch (addr.family) {
+ case AF_INET:
+ addr.addr.a4 = inet_rsk(req)->rmt_addr;
+ hash = (__force unsigned int) addr.addr.a4;
+ break;
+ case AF_INET6:
+ *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
+ hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
+ break;
+ default:
+ return NULL;
+ }
+
+ net = dev_net(dst->dev);
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_addr, &addr))
+ break;
+ }
+ tcpm_check_stamp(tm, dst);
+ return tm;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+ struct inet6_timewait_sock *tw6;
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr addr;
+ unsigned int hash;
+ struct net *net;
+
+ addr.family = tw->tw_family;
+ switch (addr.family) {
+ case AF_INET:
+ addr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) addr.addr.a4;
+ break;
+ case AF_INET6:
+ tw6 = inet6_twsk((struct sock *)tw);
+ *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
+ hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
+ break;
+ default:
+ return NULL;
+ }
+
+ net = twsk_net(tw);
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_addr, &addr))
+ break;
+ }
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+ struct dst_entry *dst,
+ bool create)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr addr;
+ unsigned int hash;
+ struct net *net;
+ bool reclaim;
+
+ addr.family = sk->sk_family;
+ switch (addr.family) {
+ case AF_INET:
+ addr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) addr.addr.a4;
+ break;
+ case AF_INET6:
+ *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
+ hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
+ break;
+ default:
+ return NULL;
+ }
+
+ net = dev_net(dst->dev);
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+
+ tm = __tcp_get_metrics(&addr, net, hash);
+ reclaim = false;
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (!tm && create)
+ tm = tcpm_new(dst, &addr, hash, reclaim);
+ else
+ tcpm_check_stamp(tm, dst);
+
+ return tm;
+}
+
+/* Save metrics learned by this TCP session. This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ unsigned long rtt;
+ u32 val;
+ int m;
+
+ if (sysctl_tcp_nometrics_save || !dst)
+ return;
+
+ if (dst->flags & DST_HOST)
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ if (icsk->icsk_backoff || !tp->srtt) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time. Reset our
+ * results.
+ */
+ tm = tcp_get_metrics(sk, dst, false);
+ if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+ tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+ goto out_unlock;
+ } else
+ tm = tcp_get_metrics(sk, dst, true);
+
+ if (!tm)
+ goto out_unlock;
+
+ rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt;
+
+ /* If newly calculated rtt larger than stored one, store new
+ * one. Otherwise, use EWMA. Remember, rtt overestimation is
+ * always better than underestimation.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+ if (m <= 0)
+ rtt = tp->srtt;
+ else
+ rtt -= (m >> 3);
+ tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
+ }
+
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+ unsigned long var;
+
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+ var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && (tp->snd_cwnd >> 1) > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_cwnd >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ if (tp->snd_cwnd > val)
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ tp->snd_cwnd);
+ }
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ }
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ * ssthresh may be also invalid.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ (val + tp->snd_ssthresh) >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && tp->snd_ssthresh > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_ssthresh);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ tp->reordering);
+ }
+ }
+ tm->tcpm_stamp = jiffies;
+out_unlock:
+ rcu_read_unlock();
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ u32 val;
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (!tm) {
+ rcu_read_unlock();
+ goto reset;
+ }
+
+ if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+ tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val) {
+ tp->snd_ssthresh = val;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ } else {
+ /* ssthresh may have been reduced unnecessarily during.
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ }
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val && tp->reordering != val) {
+ tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
+ tp->reordering = val;
+ }
+
+ val = tcp_metric_get(tm, TCP_METRIC_RTT);
+ if (val == 0 || tp->srtt == 0) {
+ rcu_read_unlock();
+ goto reset;
+ }
+ /* Initial rtt is determined from SYN,SYN-ACK.
+ * The segment is small and rtt may appear much
+ * less than real one. Use per-dst memory
+ * to make it more realistic.
+ *
+ * A bit of theory. RTT is time passed after "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets force peer to delay ACKs and calculation is correct too.
+ * The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimate RTT. BUT! If peer tries to make some clever
+ * tricks sort of "quick acks" for time long enough to decrease RTT
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+ val = msecs_to_jiffies(val);
+ if (val > tp->srtt) {
+ tp->srtt = val;
+ tp->rtt_seq = tp->snd_nxt;
+ }
+ val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+ if (val > tp->mdev) {
+ tp->mdev = val;
+ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+ }
+ rcu_read_unlock();
+
+ tcp_set_rto(sk);
+reset:
+ if (tp->srtt == 0) {
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
+ */
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+ }
+ /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC6298 more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1)
+ tp->snd_cwnd = 1;
+ else
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+{
+ struct tcp_metrics_block *tm;
+ bool ret;
+
+ if (!dst)
+ return false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_req(req, dst);
+ if (paws_check) {
+ if (tm &&
+ (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+ (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+ ret = false;
+ else
+ ret = true;
+ } else {
+ if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
+ ret = true;
+ else
+ ret = false;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+ tp->rx_opt.ts_recent = tm->tcpm_ts;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save last timestamp seen from this destination and hold
+ * it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter
+ * synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ bool ret = false;
+
+ if (dst) {
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+ tm->tcpm_ts = tp->rx_opt.ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+ }
+ return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ bool ret = false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_tw(tw);
+ if (tw) {
+ const struct tcp_timewait_sock *tcptw;
+ struct sock *sk = (struct sock *) tw;
+
+ tcptw = tcp_twsk(sk);
+ if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+ tm->tcpm_ts = tcptw->tw_ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie,
+ int *syn_loss, unsigned long *last_syn_loss)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ if (tfom->mss)
+ *mss = tfom->mss;
+ *cookie = tfom->cookie;
+ *syn_loss = tfom->syn_loss;
+ *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+ }
+ rcu_read_unlock();
+}
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+ write_seqlock_bh(&fastopen_seqlock);
+ tfom->mss = mss;
+ if (cookie->len > 0)
+ tfom->cookie = *cookie;
+ if (syn_lost) {
+ ++tfom->syn_loss;
+ tfom->last_syn_loss = jiffies;
+ } else
+ tfom->syn_loss = 0;
+ write_sequnlock_bh(&fastopen_seqlock);
+ }
+ rcu_read_unlock();
+}
+
+static unsigned int tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &tcpmhash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+ size_t size;
+ unsigned int slots;
+
+ slots = tcpmhash_entries;
+ if (!slots) {
+ if (totalram_pages >= 128 * 1024)
+ slots = 16 * 1024;
+ else
+ slots = 8 * 1024;
+ }
+
+ net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
+
+ net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv4.tcp_metrics_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+ kfree(net->ipv4.tcp_metrics_hash);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+ .init = tcp_net_metrics_init,
+ .exit = tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+ register_pernet_subsys(&tcp_net_metrics_ops);
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3cabafb5cdd1..5912ac3fd240 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,62 +49,12 @@ struct inet_timewait_death_row tcp_death_row = {
};
EXPORT_SYMBOL_GPL(tcp_death_row);
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static int tcp_remember_stamp(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_peer *peer;
- bool release_it;
-
- peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
- if (peer) {
- if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
- ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
- peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
- peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
- peer->tcp_ts = tp->rx_opt.ts_recent;
- }
- if (release_it)
- inet_putpeer(peer);
- return 1;
- }
-
- return 0;
-}
-
-static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
- struct sock *sk = (struct sock *) tw;
- struct inet_peer *peer;
-
- peer = twsk_getpeer(sk);
- if (peer) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
-
- if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
- ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
- peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
- peer->tcp_ts = tcptw->tw_ts_recent;
- }
- inet_putpeer(peer);
- return 1;
- }
- return 0;
-}
-
-static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
- return 1;
+ return true;
if (after(end_seq, s_win) && before(seq, e_win))
- return 1;
+ return true;
return seq == e_win && seq == end_seq;
}
@@ -143,11 +93,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
const u8 *hash_location;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
- int paws_reject = 0;
+ bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -316,7 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct inet_timewait_sock *tw = NULL;
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
- int recycle_ok = 0;
+ bool recycle_ok = false;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
@@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+ struct inet_sock *inet = inet_sk(sk);
- tw->tw_transparent = inet_sk(sk)->transparent;
+ tw->tw_transparent = inet->transparent;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
if (twsk->tw_md5_key) {
tcp_free_md5sig_pool();
kfree_rcu(twsk->tw_md5_key, rcu);
@@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_sock *oldtp = tcp_sk(sk);
struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
+ newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
/* TCP Cookie Transactions require space for the cookie pair,
* as it differs for each connection. There is no need to
* copy any s_data_payload stored at the original socket.
@@ -470,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
treq->snt_isn + 1 + tcp_s_data_size(oldtp);
tcp_prequeue_init(newtp);
+ INIT_LIST_HEAD(&newtp->tsq_node);
tcp_init_wl(newtp, treq->rcv_isn);
@@ -482,6 +437,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_enable_early_retrans(newtp);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -574,11 +530,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
- int paws_reject = 0;
+ bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 364784a91939..950aebfd9967 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -34,6 +34,8 @@
*
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <net/tcp.h>
#include <linux/compiler.h>
@@ -48,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
*/
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
@@ -63,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -78,9 +85,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->frto_counter = 3;
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ if (!prior_packets || tp->early_retrans_delayed)
+ tcp_rearm_rto(sk);
}
/* SND.NXT, if window was not shrunk.
@@ -369,7 +375,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->end_seq = seq;
}
-static inline int tcp_urg_mode(const struct tcp_sock *tp)
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
return tp->snd_una != tp->snd_up;
}
@@ -379,15 +385,17 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
#define OPTION_COOKIE_EXTENSION (1 << 4)
+#define OPTION_FAST_OPEN_COOKIE (1 << 8)
struct tcp_out_options {
- u8 options; /* bit field of OPTION_* */
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
- u16 mss; /* 0 to disable */
- __u32 tsval, tsecr; /* need to include OPTION_TS */
__u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
};
/* The sysctl int routines are generic, so check consistency here.
@@ -436,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
struct tcp_out_options *opts)
{
- u8 options = opts->options; /* mungable copy */
+ u16 options = opts->options; /* mungable copy */
/* Having both authentication and cookies for security is redundant,
* and there's certainly not enough room. Instead, the cookie-less
@@ -558,21 +566,37 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
tp->rx_opt.dsack = 0;
}
+
+ if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+ struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+ *ptr++ = htonl((TCPOPT_EXP << 24) |
+ ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+
+ memcpy(ptr, foc->val, foc->len);
+ if ((foc->len & 3) == 2) {
+ u8 *align = ((u8 *)ptr) + foc->len;
+ align[0] = align[1] = TCPOPT_NOP;
+ }
+ ptr += (foc->len + 3) >> 2;
+ }
}
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
-static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_cookie_values *cvp = tp->cookie_values;
- unsigned remaining = MAX_TCP_OPTION_SPACE;
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
tcp_cookie_size_check(cvp->cookie_desired) :
0;
+ struct tcp_fastopen_request *fastopen = tp->fastopen_req;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -613,6 +637,16 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = &fastopen->cookie;
+ remaining -= need;
+ tp->syn_fastopen = 1;
+ }
+ }
/* Note that timestamps are required by the specification.
*
* Odd numbers of bytes are prohibited by the specification, ensuring
@@ -663,15 +697,15 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned tcp_synack_options(struct sock *sk,
+static unsigned int tcp_synack_options(struct sock *sk,
struct request_sock *req,
- unsigned mss, struct sk_buff *skb,
+ unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5,
struct tcp_extend_values *xvp)
{
struct inet_request_sock *ireq = inet_rsk(req);
- unsigned remaining = MAX_TCP_OPTION_SPACE;
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
xvp->cookie_plus :
0;
@@ -742,13 +776,13 @@ static unsigned tcp_synack_options(struct sock *sk,
/* Compute TCP options for ESTABLISHED sockets. This is not the
* final wire format yet.
*/
-static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
- unsigned size = 0;
+ unsigned int size = 0;
unsigned int eff_sacks;
#ifdef CONFIG_TCP_MD5SIG
@@ -770,9 +804,9 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
- const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
+ const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, eff_sacks,
+ min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -782,6 +816,152 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
return size;
}
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+ struct tasklet_struct tasklet;
+ struct list_head head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+ tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+}
+/*
+ * One tasklest per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transfering tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+ struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+ LIST_HEAD(list);
+ unsigned long flags;
+ struct list_head *q, *n;
+ struct tcp_sock *tp;
+ struct sock *sk;
+
+ local_irq_save(flags);
+ list_splice_init(&tsq->head, &list);
+ local_irq_restore(flags);
+
+ list_for_each_safe(q, n, &list) {
+ tp = list_entry(q, struct tcp_sock, tsq_node);
+ list_del(&tp->tsq_node);
+
+ sk = (struct sock *)tp;
+ bh_lock_sock(sk);
+
+ if (!sock_owned_by_user(sk)) {
+ tcp_tsq_handler(sk);
+ } else {
+ /* defer the work to tcp_release_cb() */
+ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ }
+ bh_unlock_sock(sk);
+
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+ sk_free(sk);
+ }
+}
+
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
+ (1UL << TCP_WRITE_TIMER_DEFERRED) | \
+ (1UL << TCP_DELACK_TIMER_DEFERRED))
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nflags;
+
+ /* perform an atomic operation only if at least one flag is set */
+ do {
+ flags = tp->tsq_flags;
+ if (!(flags & TCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~TCP_DEFERRED_ALL;
+ } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+ if (flags & (1UL << TCP_TSQ_DEFERRED))
+ tcp_tsq_handler(sk);
+
+ if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
+ tcp_write_timer_handler(sk);
+
+ if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
+ tcp_delack_timer_handler(sk);
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+ INIT_LIST_HEAD(&tsq->head);
+ tasklet_init(&tsq->tasklet,
+ tcp_tasklet_func,
+ (unsigned long)tsq);
+ }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We cant xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+ !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+ unsigned long flags;
+ struct tsq_tasklet *tsq;
+
+ /* Keep a ref on socket.
+ * This last ref will be released in tcp_tasklet_func()
+ */
+ atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+ /* queue this socket to tasklet queue */
+ local_irq_save(flags);
+ tsq = &__get_cpu_var(tsq_tasklet);
+ list_add(&tp->tsq_node, &tsq->head);
+ tasklet_schedule(&tsq->tasklet);
+ local_irq_restore(flags);
+ } else {
+ sock_wfree(skb);
+ }
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -801,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
- unsigned tcp_options_size, tcp_header_size;
+ unsigned int tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
@@ -843,7 +1023,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
- skb_set_owner_w(skb, sk);
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+ tcp_wfree : sock_wfree;
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
th = tcp_hdr(skb);
@@ -1096,6 +1281,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
eat = min_t(int, len, skb_headlen(skb));
if (eat) {
__skb_pull(skb, eat);
+ skb->avail_size -= eat;
len -= eat;
if (!len)
return;
@@ -1149,7 +1335,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
}
/* Calculate MSS. Not accounting for SACKs here. */
-int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1160,6 +1346,14 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+ }
+
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
@@ -1178,7 +1372,7 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
}
/* Inverse of above */
-int tcp_mss_to_mtu(const struct sock *sk, int mss)
+int tcp_mss_to_mtu(struct sock *sk, int mss)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1189,6 +1383,13 @@ int tcp_mss_to_mtu(const struct sock *sk, int mss)
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mtu += icsk->icsk_af_ops->net_frag_header_len;
+ }
return mtu;
}
@@ -1258,7 +1459,7 @@ unsigned int tcp_current_mss(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
const struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- unsigned header_len;
+ unsigned int header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
@@ -1374,33 +1575,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
}
/* Minshall's variant of the Nagle send check. */
-static inline int tcp_minshall_check(const struct tcp_sock *tp)
+static inline bool tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml, tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
-/* Return 0, if packet can be sent now without violation Nagle's rules:
+/* Return false, if packet can be sent now without violation Nagle's rules:
* 1. It is full sized.
* 2. Or it contains FIN. (already checked by caller)
* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
* With Minshall's modification: all sent small packets are ACKed.
*/
-static inline int tcp_nagle_check(const struct tcp_sock *tp,
+static inline bool tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
- unsigned mss_now, int nonagle)
+ unsigned int mss_now, int nonagle)
{
return skb->len < mss_now &&
((nonagle & TCP_NAGLE_CORK) ||
(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
-/* Return non-zero if the Nagle test allows this packet to be
+/* Return true if the Nagle test allows this packet to be
* sent now.
*/
-static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
* write_queue (they have no chances to get new data).
@@ -1409,24 +1610,25 @@ static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff
* argument based upon the location of SKB in the send queue.
*/
if (nonagle & TCP_NAGLE_PUSH)
- return 1;
+ return true;
/* Don't use the nagle rule for urgent data (or for the final FIN).
* Nagle can be ignored during F-RTO too (see RFC4138).
*/
if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
- return 1;
+ return true;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
- return 1;
+ return true;
- return 0;
+ return false;
}
/* Does at least the first segment of SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
- unsigned int cur_mss)
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -1459,7 +1661,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
}
/* Test if sending is allowed right now. */
-int tcp_may_send_now(struct sock *sk)
+bool tcp_may_send_now(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
@@ -1529,7 +1731,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
*
* This algorithm is from John Heffner.
*/
-static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1589,11 +1791,11 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
/* Ok, it looks like it is advisable to defer. */
tp->tso_deferred = 1 | (jiffies << 1);
- return 1;
+ return true;
send_now:
tp->tso_deferred = 0;
- return 0;
+ return false;
}
/* Create a new MTU probe if we are ready.
@@ -1735,11 +1937,11 @@ static int tcp_mtu_probe(struct sock *sk)
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
- * Returns 1, if no segments are in flight and we have queued segments, but
- * cannot send anything now because of SWS or another problem.
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
*/
-static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
- int push_one, gfp_t gfp)
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1753,7 +1955,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
- return 0;
+ return false;
} else if (result > 0) {
sent_pkts = 1;
}
@@ -1762,6 +1964,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
@@ -1782,6 +1985,13 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
+ /* TSQ : sk_wmem_alloc accounts skb truesize,
+ * including skb overhead. But thats OK.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ break;
+ }
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
@@ -1812,7 +2022,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (likely(sent_pkts)) {
tcp_cwnd_validate(sk);
- return 0;
+ return false;
}
return !tp->packets_out && tcp_send_head(sk);
}
@@ -2011,22 +2221,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
}
/* Check if coalescing SKBs is legal. */
-static int tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
- return 0;
+ return false;
/* TODO: SACK collapsing could be used to remove this condition */
if (skb_shinfo(skb)->nr_frags != 0)
- return 0;
+ return false;
if (skb_cloned(skb))
- return 0;
+ return false;
if (skb == tcp_send_head(sk))
- return 0;
+ return false;
/* Some heurestics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- return 0;
+ return false;
- return 1;
+ return true;
}
/* Collapse packets in the retransmit queue to make to create
@@ -2037,7 +2247,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = to, *tmp;
- int first = 1;
+ bool first = true;
if (!sysctl_tcp_retrans_collapse)
return;
@@ -2051,7 +2261,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
space -= skb->len;
if (first) {
- first = 0;
+ first = false;
continue;
}
@@ -2060,7 +2270,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
/* Punt if not enough space exists in the first SKB for
* the data in the second
*/
- if (skb->len > skb_tailroom(to))
+ if (skb->len > skb_availroom(to))
break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
@@ -2166,8 +2376,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- if (net_ratelimit())
- printk(KERN_DEBUG "retrans_out leaked.\n");
+ net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
if (!tp->retrans_out)
@@ -2192,18 +2401,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
/* Check if we forward retransmits are possible in the current
* window/congestion state.
*/
-static int tcp_can_forward_retransmit(struct sock *sk)
+static bool tcp_can_forward_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
/* Forward retransmissions are possible only during Recovery. */
if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return 0;
+ return false;
/* No forward retransmissions in Reno are possible. */
if (tcp_is_reno(tp))
- return 0;
+ return false;
/* Yeah, we have to make difficult choice between forward transmission
* and retransmission... Both ways have their merits...
@@ -2214,9 +2423,9 @@ static int tcp_can_forward_retransmit(struct sock *sk)
*/
if (tcp_may_send_now(sk))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* This gets called after a retransmit timeout, and the initially
@@ -2401,7 +2610,7 @@ int tcp_send_synack(struct sock *sk)
skb = tcp_write_queue_head(sk);
if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
- printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+ pr_debug("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -2425,7 +2634,16 @@ int tcp_send_synack(struct sock *sk)
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
-/* Prepare a SYN-ACK. */
+/**
+ * tcp_make_synack - Prepare a SYN-ACK.
+ * sk: listener socket
+ * dst: dst entry attached to the SYNACK
+ * req: request_sock pointer
+ * rvp: request_values pointer
+ *
+ * Allocate one skb and build a SYNACK packet.
+ * @dst is consumed : Caller should not use it again.
+ */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct request_values *rvp)
@@ -2444,14 +2662,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
s_data_desired = cvp->s_data_desired;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
- if (skb == NULL)
+ skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ dst_release(dst);
return NULL;
-
+ }
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb_dst_set(skb, dst_clone(dst));
+ skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2561,7 +2780,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
-static void tcp_connect_init(struct sock *sk)
+void tcp_connect_init(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2616,15 +2835,121 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->rcv_nxt = 0;
- tp->rcv_wup = 0;
- tp->copied_seq = 0;
+ tp->snd_nxt = tp->write_seq;
+
+ if (likely(!tp->repair))
+ tp->rcv_nxt = 0;
+ tp->rcv_wup = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->end_seq += skb->len;
+ skb_header_release(skb);
+ __tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ tp->write_seq = tcb->end_seq;
+ tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_request *fo = tp->fastopen_req;
+ int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
+ struct sk_buff *syn_data = NULL, *data;
+ unsigned long last_syn_loss = 0;
+
+ tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
+ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+ &syn_loss, &last_syn_loss);
+ /* Recurring FO SYN losses: revert to regular handshake temporarily */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ fo->cookie.len = -1;
+ goto fallback;
+ }
+
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+ fo->cookie.len = -1;
+ else if (fo->cookie.len <= 0)
+ goto fallback;
+
+ /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+ * user-MSS. Reserve maximum option space for middleboxes that add
+ * private TCP options. The cost is reduced data space in SYN :(
+ */
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ MAX_TCP_OPTION_SPACE;
+
+ syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+ sk->sk_allocation);
+ if (syn_data == NULL)
+ goto fallback;
+
+ for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+ struct iovec *iov = &fo->data->msg_iov[i];
+ unsigned char __user *from = iov->iov_base;
+ int len = iov->iov_len;
+
+ if (syn_data->len + len > space)
+ len = space - syn_data->len;
+ else if (i + 1 == iovlen)
+ /* No more data pending in inet_wait_for_connect() */
+ fo->data = NULL;
+
+ if (skb_add_data(syn_data, from, len))
+ goto fallback;
+ }
+
+ /* Queue a data-only packet after the regular SYN for retransmission */
+ data = pskb_copy(syn_data, sk->sk_allocation);
+ if (data == NULL)
+ goto fallback;
+ TCP_SKB_CB(data)->seq++;
+ TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+ tcp_connect_queue_skb(sk, data);
+ fo->copied = data->len;
+
+ if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ tp->syn_data = (fo->copied > 0);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ goto done;
+ }
+ syn_data = NULL;
+
+fallback:
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+ kfree_skb(syn_data);
+done:
+ fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
+ return err;
+}
+
/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
@@ -2641,19 +2966,14 @@ int tcp_connect(struct sock *sk)
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
- tp->snd_nxt = tp->write_seq;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+ tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_connect_queue_skb(sk, buff);
TCP_ECN_send_syn(sk, buff);
- /* Send it off. */
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tp->retrans_stamp = TCP_SKB_CB(buff)->when;
- skb_header_release(buff);
- __tcp_add_write_queue_tail(sk, buff);
- sk->sk_wmem_queued += buff->truesize;
- sk_mem_charge(sk, buff->truesize);
- tp->packets_out += tcp_skb_pcount(buff);
- err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ /* Send off SYN; include data in Fast Open. */
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
if (err == -ECONNREFUSED)
return err;
@@ -2790,6 +3110,15 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
+void tcp_send_window_probe(struct sock *sk)
+{
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
+ tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk)
{
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index a981cdc0a6e9..4526fe68e60e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -91,7 +91,7 @@ static inline int tcp_probe_avail(void)
* Note: arguments must match tcp_rcv_established()!
*/
static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ struct tcphdr *th, unsigned int len)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
@@ -138,7 +138,7 @@ static struct jprobe tcp_jprobe = {
.entry = jtcp_rcv_established,
};
-static int tcpprobe_open(struct inode * inode, struct file * file)
+static int tcpprobe_open(struct inode *inode, struct file *file)
{
/* Reset (empty) log */
spin_lock_bh(&tcp_probe.lock);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 34d4a02c2f16..6df36ad55a38 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
int sysctl_tcp_thin_linear_timeouts __read_mostly;
-static void tcp_write_timer(unsigned long);
-static void tcp_delack_timer(unsigned long);
-static void tcp_keepalive_timer (unsigned long data);
-
-void tcp_init_xmit_timers(struct sock *sk)
-{
- inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
- &tcp_keepalive_timer);
-}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-
static void tcp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk)
return 0;
}
-static void tcp_delack_timer(unsigned long data)
+void tcp_delack_timer_handler(struct sock *sk)
{
- struct sock *sk = (struct sock *)data;
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
- goto out_unlock;
- }
-
sk_mem_reclaim_partial(sk);
if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data)
out:
if (sk_under_memory_pressure(sk))
sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_delack_timer_handler(sk);
+ } else {
+ inet_csk(sk)->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ /* deleguate our work to tcp_release_cb() */
+ set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+ }
bh_unlock_sock(sk);
sock_put(sk);
}
@@ -319,6 +312,11 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ if (tp->early_retrans_delayed) {
+ tcp_resume_early_retransmit(sk);
+ return;
+ }
+
if (!tp->packets_out)
goto out;
@@ -445,19 +443,11 @@ out_reset_timer:
out:;
}
-static void tcp_write_timer(unsigned long data)
+void tcp_write_timer_handler(struct sock *sk)
{
- struct sock *sk = (struct sock *)data;
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later */
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
- goto out_unlock;
- }
-
if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
goto out;
@@ -480,7 +470,19 @@ static void tcp_write_timer(unsigned long data)
out:
sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_write_timer_handler(sk);
+ } else {
+ /* deleguate our work to tcp_release_cb() */
+ set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+ }
bh_unlock_sock(sk);
sock_put(sk);
}
@@ -597,3 +599,10 @@ out:
bh_unlock_sock(sk);
sock_put(sk);
}
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fe141052a1be..b4c3582a991f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -107,6 +107,8 @@
#include <net/checksum.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
+#include <linux/static_key.h>
+#include <trace/events/skb.h>
#include "udp_impl.h"
struct udp_table udp_table __read_mostly;
@@ -206,7 +208,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
if (!snum) {
int low, high, remaining;
- unsigned rand;
+ unsigned int rand;
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
@@ -614,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
break;
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
if (inet->pmtudisc != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
@@ -627,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
err = icmp_err_convert[code].errno;
}
break;
+ case ICMP_REDIRECT:
+ ipv4_sk_redirect(skb, sk);
+ break;
}
/*
@@ -846,7 +852,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* Get and verify the address.
*/
if (msg->msg_name) {
- struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -1218,8 +1224,10 @@ try_again:
goto csum_copy_err;
}
- if (err)
+ if (unlikely(err)) {
+ trace_kfree_skb(skb, udp_recvmsg);
goto out_free;
+ }
if (!peeked)
UDP_INC_STATS_USER(sock_net(sk),
@@ -1379,6 +1387,14 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
+static struct static_key udp_encap_needed __read_mostly;
+void udp_encap_enable(void)
+{
+ if (!static_key_enabled(&udp_encap_needed))
+ static_key_slow_inc(&udp_encap_needed);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
/* returns:
* -1: error
* 0: success
@@ -1400,7 +1416,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
nf_reset(skb);
- if (up->encap_type) {
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -1470,7 +1486,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
- if (sk_rcvqueues_full(sk, skb))
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
goto drop;
rc = 0;
@@ -1479,7 +1495,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb)) {
+ else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
goto drop;
}
@@ -1760,6 +1776,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
+ udp_encap_enable();
break;
default:
err = -ENOPROTOOPT;
@@ -2163,9 +2180,15 @@ void udp4_proc_exit(void)
static __initdata unsigned long uhash_entries;
static int __init set_uhash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- uhash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &uhash_entries);
+ if (ret)
+ return 0;
+
if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
uhash_entries = UDP_HTABLE_SIZE_MIN;
return 1;
@@ -2176,26 +2199,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)
{
unsigned int i;
- if (!CONFIG_BASE_SMALL)
- table->hash = alloc_large_system_hash(name,
- 2 * sizeof(struct udp_hslot),
- uhash_entries,
- 21, /* one slot per 2 MB */
- 0,
- &table->log,
- &table->mask,
- 64 * 1024);
- /*
- * Make sure hash table has the minimum size
- */
- if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
- table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
- 2 * sizeof(struct udp_hslot), GFP_KERNEL);
- if (!table->hash)
- panic(name);
- table->log = ilog2(UDP_HTABLE_SIZE_MIN);
- table->mask = UDP_HTABLE_SIZE_MIN - 1;
- }
+ table->hash = alloc_large_system_hash(name,
+ 2 * sizeof(struct udp_hslot),
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ UDP_HTABLE_SIZE_MIN,
+ 64 * 1024);
+
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 8a949f19deb6..16d0960062be 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
int err = -EINVAL;
struct sock *sk;
struct sk_buff *rep;
+ struct net *net = sock_net(in_skb->sk);
if (req->sdiag_family == AF_INET)
- sk = __udp4_lib_lookup(&init_net,
+ sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
req->id.idiag_if, tbl);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6)
- sk = __udp6_lib_lookup(&init_net,
+ sk = __udp6_lib_lookup(net,
(struct in6_addr *)req->id.idiag_src,
req->id.idiag_sport,
(struct in6_addr *)req->id.idiag_dst,
@@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
kfree_skb(rep);
goto out;
}
- err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
MSG_DONTWAIT);
if (err > 0)
err = 0;
@@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
struct inet_diag_req_v2 *r, struct nlattr *bc)
{
int num, s_num, slot, s_slot;
+ struct net *net = sock_net(skb->sk);
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
sk_nulls_for_each(sk, node, &hslot->head) {
struct inet_sock *inet = inet_sk(sk);
+ if (!net_eq(sock_net(sk), net))
+ continue;
if (num < s_num)
goto next;
if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -146,9 +150,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
return udp_dump_one(&udp_table, in_skb, nlh, req);
}
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
static const struct inet_diag_handler udp_diag_handler = {
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
};
@@ -167,6 +179,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *
static const struct inet_diag_handler udplite_diag_handler = {
.dump = udplite_diag_dump,
.dump_one = udplite_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
};
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index aaad650d47d9..5a681e298b90 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags, int *addr_len);
extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
-extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
+extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
extern void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ed4bf11ef9f4..ddee0a099a2c 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,6 +15,65 @@
#include <net/ip.h>
#include <net/xfrm.h>
+/* Informational hook. The decap is still done here. */
+static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
+
+int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ mutex_lock(&xfrm4_mode_tunnel_input_mutex);
+
+ for (pprev = &rcv_notify_handlers;
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
+
+int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+ int ret = -ENOENT;
+
+ mutex_lock(&xfrm4_mode_tunnel_input_mutex);
+ for (pprev = &rcv_notify_handlers;
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
+
static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
{
struct iphdr *inner_iph = ipip_hdr(skb);
@@ -64,8 +123,14 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
return 0;
}
+#define for_each_input_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next))
+
static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
+ struct xfrm_tunnel *handler;
int err = -EINVAL;
if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -74,6 +139,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto out;
+ for_each_input_rcu(rcv_notify_handlers, handler)
+ handler->handler(skb);
+
if (skb_cloned(skb) &&
(err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
goto out;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a0b4c5da8d43..fcf7678bc009 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,10 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.dst.dev = dev;
dev_hold(dev);
- xdst->u.rt.peer = rt->peer;
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
@@ -102,7 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_src = rt->rt_src;
xdst->u.rt.rt_dst = rt->rt_dst;
xdst->u.rt.rt_gateway = rt->rt_gateway;
- xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
+ xdst->u.rt.rt_pmtu = rt->rt_pmtu;
return 0;
}
@@ -152,7 +148,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
case IPPROTO_AH:
if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32*)xprth;
+ __be32 *ah_hdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ah_hdr[1];
}
@@ -198,12 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
-static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, sk, skb, mtu);
+}
+
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, mtu);
+ path->ops->redirect(path, sk, skb);
}
static void xfrm4_dst_destroy(struct dst_entry *dst)
@@ -212,9 +218,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
dst_destroy_metrics_generic(dst);
- if (likely(xdst->u.rt.peer))
- inet_putpeer(xdst->u.rt.peer);
-
xfrm_dst_destroy(xdst);
}
@@ -232,6 +235,7 @@ static struct dst_ops xfrm4_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
+ .redirect = xfrm4_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
@@ -298,8 +302,8 @@ void __init xfrm4_init(int rt_max_size)
xfrm4_state_init();
xfrm4_policy_init();
#ifdef CONFIG_SYSCTL
- sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
- xfrm4_policy_table);
+ sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4",
+ xfrm4_policy_table);
#endif
}