diff options
Diffstat (limited to 'net/ipv4')
74 files changed, 5288 insertions, 2999 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index d183262943d9..5a19aeb86094 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -262,8 +262,8 @@ config ARPD bool "IP: ARP daemon support" ---help--- The kernel maintains an internal cache which maps IP addresses to - hardware addresses on the local network, so that Ethernet/Token Ring/ - etc. frames are sent to the proper address on the physical networking + hardware addresses on the local network, so that Ethernet + frames are sent to the proper address on the physical networking layer. Normally, kernel uses the ARP protocol to resolve these mappings. @@ -310,9 +310,20 @@ config SYN_COOKIES If unsure, say N. +config NET_IPVTI + tristate "Virtual (secure) IP: tunneling" + select INET_TUNNEL + depends on INET_XFRM_MODE_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This can be used with xfrm mode tunnel to give + the notion of a secure tunnel for IPSEC and then use routing protocol + on top. + config INET_AH tristate "IP: AH transformation" - select XFRM + select XFRM_ALGO select CRYPTO select CRYPTO_HMAC select CRYPTO_MD5 @@ -324,7 +335,7 @@ config INET_AH config INET_ESP tristate "IP: ESP transformation" - select XFRM + select XFRM_ALGO select CRYPTO select CRYPTO_AUTHENC select CRYPTO_HMAC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ff75d3bbcd6a..ae2ccf2890e4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o \ + tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ @@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 10e3751466b5..fe4582ca969a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -242,20 +243,18 @@ void build_ehash_secret(void) } EXPORT_SYMBOL(build_ehash_secret); -static inline int inet_netns_ok(struct net *net, int protocol) +static inline int inet_netns_ok(struct net *net, __u8 protocol) { - int hash; const struct net_protocol *ipprot; if (net_eq(net, &init_net)) return 1; - hash = protocol & (MAX_INET_PROTOS - 1); - ipprot = rcu_dereference(inet_protos[hash]); - - if (ipprot == NULL) + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot == NULL) { /* raw IP is OK */ return 1; + } return ipprot->netns_ok; } @@ -350,7 +349,7 @@ lookup_protocol: err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) - sk->sk_reuse = 1; + sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; @@ -541,7 +540,7 @@ out: } EXPORT_SYMBOL(inet_bind); -int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -553,15 +552,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); + return sk->sk_prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); -static long inet_wait_for_connect(struct sock *sk, long timeo) +static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -577,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); + sk->sk_write_pending -= writebias; return timeo; } @@ -584,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; int err; @@ -594,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; - lock_sock(sk); - if (uaddr->sa_family == AF_UNSPEC) { err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; @@ -635,8 +634,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int writebias = (sk->sk_protocol == IPPROTO_TCP) && + tcp_sk(sk)->fastopen_req && + tcp_sk(sk)->fastopen_req->data ? 1 : 0; + /* Error code is set above */ - if (!timeo || !inet_wait_for_connect(sk, timeo)) + if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); @@ -658,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; err = 0; out: - release_sock(sk); return err; sock_error: @@ -668,6 +670,18 @@ sock_error: sock->state = SS_DISCONNECTING; goto out; } +EXPORT_SYMBOL(__inet_stream_connect); + +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + int err; + + lock_sock(sock->sk); + err = __inet_stream_connect(sock, uaddr, addr_len, flags); + release_sock(sock->sk); + return err; +} EXPORT_SYMBOL(inet_stream_connect); /* @@ -1216,8 +1230,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) { - const struct iphdr *iph; const struct net_protocol *ops; + const struct iphdr *iph; int proto; int ihl; int err = -EINVAL; @@ -1236,7 +1250,7 @@ static int inet_gso_send_check(struct sk_buff *skb) __skb_pull(skb, ihl); skb_reset_transport_header(skb); iph = ip_hdr(skb); - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; err = -EPROTONOSUPPORT; rcu_read_lock(); @@ -1253,8 +1267,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - struct iphdr *iph; const struct net_protocol *ops; + struct iphdr *iph; int proto; int ihl; int id; @@ -1286,7 +1300,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb_reset_transport_header(skb); iph = ip_hdr(skb); id = ntohs(iph->id); - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; segs = ERR_PTR(-EPROTONOSUPPORT); rcu_read_lock(); @@ -1340,7 +1354,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, goto out; } - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; rcu_read_lock(); ops = rcu_dereference(inet_protos[proto]); @@ -1398,11 +1412,11 @@ out: static int inet_gro_complete(struct sk_buff *skb) { - const struct net_protocol *ops; + __be16 newlen = htons(skb->len - skb_network_offset(skb)); struct iphdr *iph = ip_hdr(skb); - int proto = iph->protocol & (MAX_INET_PROTOS - 1); + const struct net_protocol *ops; + int proto = iph->protocol; int err = -ENOSYS; - __be16 newlen = htons(skb->len - skb_network_offset(skb)); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; @@ -1520,14 +1534,15 @@ static const struct net_protocol igmp_protocol = { #endif static const struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - .no_policy = 1, - .netns_ok = 1, + .early_demux = tcp_v4_early_demux, + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + .no_policy = 1, + .netns_ok = 1, }; static const struct net_protocol udp_protocol = { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index fd508b526014..a0d8392491c3 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -77,7 +77,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { - unsigned char * optptr = (unsigned char*)(iph+1); + unsigned char *optptr = (unsigned char *)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); int optlen; @@ -398,16 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info) struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return; - printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", - ntohl(ah->spi), ntohl(iph->daddr)); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 18d9b81ecb1a..2e560f0c757d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -73,6 +73,8 @@ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> @@ -89,7 +91,6 @@ #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> -#include <linux/trdevice.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -193,9 +194,6 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; - case ARPHRD_IEEE802_TR: - ip_tr_mc_map(addr, haddr); - return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; @@ -364,8 +362,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) probes -= neigh->parms->ucast_probes; if (probes < 0) { if (!(neigh->nud_state & NUD_VALID)) - printk(KERN_DEBUG - "trying to ucast probe in NUD_INVALID\n"); + pr_debug("trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; read_lock_bh(&neigh->lock); } else { @@ -452,7 +449,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr, { switch (addr_hint) { case RTN_LOCAL: - printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + pr_debug("arp called for own IP address\n"); memcpy(haddr, dev->dev_addr, dev->addr_len); return 1; case RTN_MULTICAST: @@ -473,7 +470,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) struct neighbour *n; if (!skb_dst(skb)) { - printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); + pr_debug("arp_find is called with dst==NULL\n"); kfree_skb(skb); return 1; } @@ -648,12 +645,6 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp->ar_pro = htons(ETH_P_IP); break; #endif -#if IS_ENABLED(CONFIG_TR) - case ARPHRD_IEEE802_TR: - arp->ar_hrd = htons(ARPHRD_IEEE802); - arp->ar_pro = htons(ETH_P_IP); - break; -#endif } arp->ar_hln = dev->addr_len; @@ -751,11 +742,10 @@ static int arp_process(struct sk_buff *skb) goto out; break; case ARPHRD_ETHER: - case ARPHRD_IEEE802_TR: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* - * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 + * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that @@ -800,7 +790,8 @@ static int arp_process(struct sk_buff *skb) * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ - if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + if (ipv4_is_multicast(tip) || + (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out; /* @@ -1059,7 +1050,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { - unsigned state = NUD_STALE; + unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) state = NUD_PERMANENT; err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? @@ -1071,7 +1062,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, return err; } -static unsigned arp_state_to_flags(struct neighbour *neigh) +static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c48adc565e92..667c1d4ca984 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1725,8 +1725,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is - * not the loopback device drop the packet. */ - if (!(skb->dev->flags & IFF_LOOPBACK)) { + * not the loopback device drop the packet. Further, + * there is no legitimate reason for setting this from + * userspace so reject it if skb is NULL. */ + if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6e447ff94dfa..44bf82e3aef7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -217,8 +217,7 @@ void in_dev_finish_destroy(struct in_device *idev) WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG - printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", - idev, dev ? dev->name : "NIL"); + pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif dev_put(dev); if (!idev->dead) @@ -1125,7 +1124,7 @@ skip: } } -static inline bool inetdev_valid_mtu(unsigned mtu) +static inline bool inetdev_valid_mtu(unsigned int mtu) { return mtu >= 68; } @@ -1174,7 +1173,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_REGISTER: - printk(KERN_DEBUG "inetdev_event: bug\n"); + pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: @@ -1266,17 +1265,15 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; - if (ifa->ifa_address) - NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address); - - if (ifa->ifa_local) - NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local); - - if (ifa->ifa_broadcast) - NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); - - if (ifa->ifa_label[0]) - NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); + if ((ifa->ifa_address && + nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || + (ifa->ifa_local && + nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || + (ifa->ifa_broadcast && + nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || + (ifa->ifa_label[0] && + nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) + goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -1503,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); - if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) + if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || + i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net, 0); } @@ -1587,7 +1585,6 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write, static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; - char *dev_name; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -1621,6 +1618,8 @@ static struct devinet_sysctl_table { "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), + DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, + "route_localnet"), }, }; @@ -1629,16 +1628,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, { int i; struct devinet_sysctl_table *t; - -#define DEVINET_CTL_PATH_DEV 3 - - struct ctl_path devinet_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "conf", }, - { /* to be set */ }, - { }, - }; + char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); if (!t) @@ -1650,27 +1640,15 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, t->devinet_vars[i].extra2 = net; } - /* - * Make a copy of dev_name, because '.procname' is regarded as const - * by sysctl and we wouldn't want anyone to change it under our feet - * (see SIOCSIFNAME). - */ - t->dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!t->dev_name) - goto free; - - devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; + snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); - t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, - t->devinet_vars); + t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) - goto free_procname; + goto free; p->sysctl = t; return 0; -free_procname: - kfree(t->dev_name); free: kfree(t); out: @@ -1686,7 +1664,6 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); - kfree(t->dev_name); kfree(t); } @@ -1716,12 +1693,6 @@ static struct ctl_table ctl_forward_entry[] = { }, { }, }; - -static __net_initdata struct ctl_path net_ipv4_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { }, -}; #endif static __net_init int devinet_init_net(struct net *net) @@ -1767,7 +1738,7 @@ static __net_init int devinet_init_net(struct net *net) goto err_reg_dflt; err = -ENOMEM; - forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl); + forw_hdr = register_net_sysctl(net, "net/ipv4", tbl); if (forw_hdr == NULL) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 89a47b35905d..b61e9deb7c7e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -459,28 +459,22 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) struct esp_data *esp = x->data; u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); u32 align = max_t(u32, blksize, esp->padlen); - u32 rem; - - mtu -= x->props.header_len + crypto_aead_authsize(esp->aead); - rem = mtu & (align - 1); - mtu &= ~(align - 1); + unsigned int net_adj; switch (x->props.mode) { - case XFRM_MODE_TUNNEL: - break; - default: case XFRM_MODE_TRANSPORT: - /* The worst case */ - mtu -= blksize - 4; - mtu += min_t(u32, blksize - 4, rem); - break; case XFRM_MODE_BEET: - /* The worst case. */ - mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem); + net_adj = sizeof(struct iphdr); + break; + case XFRM_MODE_TUNNEL: + net_adj = 0; break; + default: + BUG(); } - return mtu - 2; + return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - + net_adj) & ~(align - 1)) + (net_adj - 2); } static void esp4_err(struct sk_buff *skb, u32 info) @@ -490,16 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info) struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr)); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cbe3a68507cf..b83203658ee3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -31,6 +31,7 @@ #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> +#include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> @@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id) tb = fib_trie_table(id); if (!tb) return NULL; + + switch (id) { + case RT_TABLE_LOCAL: + net->ipv4.fib_local = tb; + break; + + case RT_TABLE_MAIN: + net->ipv4.fib_main = tb; + break; + + case RT_TABLE_DEFAULT: + net->ipv4.fib_default = tb; + break; + + default: + break; + } + h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; @@ -136,13 +155,13 @@ static void fib_flush(struct net *net) * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ -static inline unsigned __inet_dev_addr_type(struct net *net, - const struct net_device *dev, - __be32 addr) +static inline unsigned int __inet_dev_addr_type(struct net *net, + const struct net_device *dev, + __be32 addr) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; - unsigned ret = RTN_BROADCAST; + unsigned int ret = RTN_BROADCAST; struct fib_table *local_table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) @@ -150,10 +169,6 @@ static inline unsigned __inet_dev_addr_type(struct net *net, if (ipv4_is_multicast(addr)) return RTN_MULTICAST; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; @@ -180,6 +195,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, } EXPORT_SYMBOL(inet_dev_addr_type); +__be32 fib_compute_spec_dst(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct rtable *rt; + struct flowi4 fl4; + struct net *net; + int scope; + + rt = skb_rtable(skb); + if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == + RTCF_LOCAL) + return ip_hdr(skb)->daddr; + + in_dev = __in_dev_get_rcu(dev); + BUG_ON(!in_dev); + + net = dev_net(dev); + + scope = RT_SCOPE_UNIVERSE; + if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { + fl4.flowi4_oif = 0; + fl4.flowi4_iif = net->loopback_dev->ifindex; + fl4.daddr = ip_hdr(skb)->saddr; + fl4.saddr = 0; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_scope = scope; + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + if (!fib_lookup(net, &fl4, &res)) + return FIB_RES_PREFSRC(net, res); + } else { + scope = RT_SCOPE_LINK; + } + + return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); +} + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -188,17 +241,15 @@ EXPORT_SYMBOL(inet_dev_addr_type); * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, __be32 *spec_dst, - u32 *itag) +static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + int rpf, struct in_device *idev, u32 *itag) { - struct in_device *in_dev; - struct flowi4 fl4; + int ret, no_addr, accept_local; struct fib_result res; - int no_addr, rpf, accept_local; - bool dev_match; - int ret; + struct flowi4 fl4; struct net *net; + bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_iif = oif; @@ -207,20 +258,11 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - no_addr = rpf = accept_local = 0; - in_dev = __in_dev_get_rcu(dev); - if (in_dev) { - no_addr = in_dev->ifa_list == NULL; - - /* Ignore rp_filter for packets protected by IPsec. */ - rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); - - accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); - fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - } + no_addr = accept_local = 0; + no_addr = idev->ifa_list == NULL; - if (in_dev == NULL) - goto e_inval; + accept_local = IN_DEV_ACCEPT_LOCAL(idev); + fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; net = dev_net(dev); if (fib_lookup(net, &fl4, &res)) @@ -229,7 +271,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -258,17 +299,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { - if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(net, res); + if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; - } } return ret; last_resort: if (rpf) goto e_rpf; - *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; @@ -278,6 +316,20 @@ e_rpf: return -EXDEV; } +/* Ignore rp_filter for packets protected by IPsec. */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + + if (!r && !fib_num_tclassid_users(dev_net(dev))) { + *itag = 0; + return 0; + } + return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); +} + static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; @@ -740,7 +792,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 - unsigned ok = 0; + unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ @@ -879,10 +931,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) .flowi4_scope = frn->fl_scope, }; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - frn->err = -ENOENT; if (tb) { local_bh_disable(); @@ -935,8 +983,11 @@ static void nl_fib_input(struct sk_buff *skb) static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = nl_fib_input, + }; + + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg); if (sk == NULL) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; @@ -1090,6 +1141,9 @@ static int __net_init fib_net_init(struct net *net) { int error; +#ifdef CONFIG_IP_ROUTE_CLASSID + net->ipv4.fib_num_tclassid_users = 0; +#endif error = ip_fib_net_init(net); if (error < 0) goto out; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 799fc790b3cf..a83d74e498d2 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,14 +47,7 @@ struct fib4_rule { #endif }; -#ifdef CONFIG_IP_ROUTE_CLASSID -u32 fib_rules_tclass(const struct fib_result *res) -{ - return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; -} -#endif - -int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -63,11 +56,15 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) int err; err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); - res->r = arg.rule; - +#ifdef CONFIG_IP_ROUTE_CLASSID + if (arg.rule) + res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; + else + res->tclassid = 0; +#endif return err; } -EXPORT_SYMBOL_GPL(fib_lookup); +EXPORT_SYMBOL_GPL(__fib_lookup); static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) @@ -169,8 +166,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dst = nla_get_be32(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID - if (tb[FRA_FLOW]) + if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); + if (rule4->tclassid) + net->ipv4.fib_num_tclassid_users++; + } #endif rule4->src_len = frh->src_len; @@ -179,11 +179,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dstmask = inet_make_mask(rule4->dst_len); rule4->tos = frh->tos; + net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; } +static void fib4_rule_delete(struct fib_rule *rule) +{ + struct net *net = rule->fr_net; +#ifdef CONFIG_IP_ROUTE_CLASSID + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (rule4->tclassid) + net->ipv4.fib_num_tclassid_users--; +#endif + net->ipv4.fib_has_custom_rules = true; +} + static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -221,15 +234,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->src_len = rule4->src_len; frh->tos = rule4->tos; - if (rule4->dst_len) - NLA_PUT_BE32(skb, FRA_DST, rule4->dst); - - if (rule4->src_len) - NLA_PUT_BE32(skb, FRA_SRC, rule4->src); - + if ((rule4->dst_len && + nla_put_be32(skb, FRA_DST, rule4->dst)) || + (rule4->src_len && + nla_put_be32(skb, FRA_SRC, rule4->src))) + goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID - if (rule4->tclassid) - NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); + if (rule4->tclassid && + nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) + goto nla_put_failure; #endif return 0; @@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { .action = fib4_rule_action, .match = fib4_rule_match, .configure = fib4_rule_configure, + .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .default_pref = fib_default_rule_pref, @@ -295,6 +309,7 @@ int __net_init fib4_rules_init(struct net *net) if (err < 0) goto fail; net->ipv4.rules_ops = ops; + net->ipv4.fib_has_custom_rules = false; return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5063fa38ac7b..2b57d768240d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,11 +140,40 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; +static void free_nh_exceptions(struct fib_nh *nh) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + fnhe = rcu_dereference_protected(hash[i].chain, 1); + while (fnhe) { + struct fib_nh_exception *next; + + next = rcu_dereference_protected(fnhe->fnhe_next, 1); + kfree(fnhe); + + fnhe = next; + } + } + kfree(hash); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + change_nexthops(fi) { + if (nexthop_nh->nh_dev) + dev_put(nexthop_nh->nh_dev); + if (nexthop_nh->nh_exceptions) + free_nh_exceptions(nexthop_nh); + } endfor_nexthops(fi); + + release_net(fi->fib_net); if (fi->fib_metrics != (u32 *) dst_default_metrics) kfree(fi->fib_metrics); kfree(fi); @@ -156,13 +185,13 @@ void free_fib_info(struct fib_info *fi) pr_warn("Freeing alive fib_info %p\n", fi); return; } + fib_info_cnt--; +#ifdef CONFIG_IP_ROUTE_CLASSID change_nexthops(fi) { - if (nexthop_nh->nh_dev) - dev_put(nexthop_nh->nh_dev); - nexthop_nh->nh_dev = NULL; + if (nexthop_nh->nh_tclassid) + fi->fib_net->ipv4.fib_num_tclassid_users--; } endfor_nexthops(fi); - fib_info_cnt--; - release_net(fi->fib_net); +#endif call_rcu(&fi->rcu, free_fib_info_rcu); } @@ -421,6 +450,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + if (nexthop_nh->nh_tclassid) + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif } @@ -779,9 +810,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int type = nla_type(nla); if (type) { + u32 val; + if (type > RTAX_MAX) goto err_inval; - fi->fib_metrics[type - 1] = nla_get_u32(nla); + val = nla_get_u32(nla); + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + fi->fib_metrics[type - 1] = val; } } } @@ -810,6 +848,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; @@ -931,33 +971,36 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; - NLA_PUT_U32(skb, RTA_TABLE, tb_id); + if (nla_put_u32(skb, RTA_TABLE, tb_id)) + goto nla_put_failure; rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; - if (rtm->rtm_dst_len) - NLA_PUT_BE32(skb, RTA_DST, dst); - - if (fi->fib_priority) - NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority); - + if (rtm->rtm_dst_len && + nla_put_be32(skb, RTA_DST, dst)) + goto nla_put_failure; + if (fi->fib_priority && + nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) + goto nla_put_failure; if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) goto nla_put_failure; - if (fi->fib_prefsrc) - NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc); - + if (fi->fib_prefsrc && + nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) + goto nla_put_failure; if (fi->fib_nhs == 1) { - if (fi->fib_nh->nh_gw) - NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw); - - if (fi->fib_nh->nh_oif) - NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); + if (fi->fib_nh->nh_gw && + nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) + goto nla_put_failure; + if (fi->fib_nh->nh_oif && + nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) + goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID - if (fi->fib_nh[0].nh_tclassid) - NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); + if (fi->fib_nh[0].nh_tclassid && + nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) + goto nla_put_failure; #endif } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -978,11 +1021,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; - if (nh->nh_gw) - NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); + if (nh->nh_gw && + nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) + goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid) - NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; #endif /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index bce36f1a37b4..18cbc15b20d5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = (struct tnode *) resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, tn); - tnode_put_child_reorg((struct tnode *)tp, cindex, + tnode_put_child_reorg(tp, cindex, (struct rt_trie_node *)tn, wasfull); tp = node_parent((struct rt_trie_node *) tn); @@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = (struct tnode *)resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, tn); rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); @@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); + put_child(t, tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, - (struct rt_trie_node *)tn); + put_child(t, tp, cindex, (struct rt_trie_node *)tn); } else { rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; @@ -1370,6 +1369,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; + if (fi->fib_dead) + continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); @@ -1618,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) if (tp) { t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, NULL); + put_child(t, tp, cindex, NULL); trie_rebalance(t, tp); } else RCU_INIT_POINTER(t->trie, NULL); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2cb2bf845641..ea3a996de95b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -95,6 +95,7 @@ #include <net/checksum.h> #include <net/xfrm.h> #include <net/inet_common.h> +#include <net/ip_fib.h> /* * Build xmit assembly blocks @@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, /* Limit if icmp type is enabled in ratemask. */ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - if (!rt->peer) - rt_bind_peer(rt, fl4->daddr, 1); - rc = inet_peer_xrlim_allow(rt->peer, + struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); + inet_putpeer(peer); } out: return rc; @@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; - __be32 daddr; + __be32 daddr, saddr; if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; @@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = ip_hdr(skb)->saddr; + saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; ipc.tx_flags = 0; if (icmp_param->replyopts.opt.opt.optlen) { @@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = rt->rt_spec_dst; + fl4.saddr = saddr; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); @@ -632,6 +634,27 @@ out:; EXPORT_SYMBOL(icmp_send); +static void icmp_socket_deliver(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. + */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + return; + + raw_icmp_error(skb, protocol, info); + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); +} + /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. */ @@ -640,10 +663,8 @@ static void icmp_unreach(struct sk_buff *skb) { const struct iphdr *iph; struct icmphdr *icmph; - int hash, protocol; - const struct net_protocol *ipprot; - u32 info = 0; struct net *net; + u32 info = 0; net = dev_net(skb_dst(skb)->dev); @@ -674,9 +695,7 @@ static void icmp_unreach(struct sk_buff *skb) LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); } else { - info = ip_rt_frag_needed(net, iph, - ntohs(icmph->un.frag.mtu), - skb->dev); + info = ntohs(icmph->un.frag.mtu); if (!info) goto out; } @@ -713,34 +732,14 @@ static void icmp_unreach(struct sk_buff *skb) if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { - if (net_ratelimit()) - pr_warn("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", - &ip_hdr(skb)->saddr, - icmph->type, icmph->code, - &iph->daddr, skb->dev->name); + net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", + &ip_hdr(skb)->saddr, + icmph->type, icmph->code, + &iph->daddr, skb->dev->name); goto out; } - /* Checkin full IP header plus 8 bytes of protocol to - * avoid additional coding at protocol handlers. - */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) - goto out; - - iph = (const struct iphdr *)skb->data; - protocol = iph->protocol; - - /* - * Deliver ICMP message to raw sockets. Pretty useless feature? - */ - raw_icmp_error(skb, protocol, info); - - hash = protocol & (MAX_INET_PROTOS - 1); - rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[hash]); - if (ipprot && ipprot->err_handler) - ipprot->err_handler(skb, info); - rcu_read_unlock(); + icmp_socket_deliver(skb, info); out: return; @@ -756,46 +755,15 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { - const struct iphdr *iph; - - if (skb->len < sizeof(struct iphdr)) - goto out_err; - - /* - * Get the copied header of the packet that caused the redirect - */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - - iph = (const struct iphdr *)skb->data; - - switch (icmp_hdr(skb)->code & 7) { - case ICMP_REDIR_NET: - case ICMP_REDIR_NETTOS: - /* - * As per RFC recommendations now handle it as a host redirect. - */ - case ICMP_REDIR_HOST: - case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, - icmp_hdr(skb)->un.gateway, - iph->saddr, skb->dev); - break; + if (skb->len < sizeof(struct iphdr)) { + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + return; } - /* Ping wants to see redirects. - * Let's pretend they are errors of sorts... */ - if (iph->protocol == IPPROTO_ICMP && - iph->ihl >= 5 && - pskb_may_pull(skb, (iph->ihl<<2)+8)) { - ping_err(skb, icmp_hdr(skb)->un.gateway); - } + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return; -out: - return; -out_err: - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - goto out; + icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); } /* @@ -906,8 +874,7 @@ out_err: static void icmp_address(struct sk_buff *skb) { #if 0 - if (net_ratelimit()) - printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); + net_dbg_ratelimited("a guy asks for address mask. Who is it?\n"); #endif } @@ -943,10 +910,10 @@ static void icmp_address_reply(struct sk_buff *skb) inet_ifa_match(ip_hdr(skb)->saddr, ifa)) break; } - if (!ifa && net_ratelimit()) { - pr_info("Wrong address mask %pI4 from %s/%pI4\n", - mp, dev->name, &ip_hdr(skb)->saddr); - } + if (!ifa) + net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n", + mp, + dev->name, &ip_hdr(skb)->saddr); } } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5dfecfd7d5e9..6699f23e6f55 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -344,10 +344,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(pip, &rt->dst, NULL); - ((u8*)&pip[1])[0] = IPOPT_RA; - ((u8*)&pip[1])[1] = 4; - ((u8*)&pip[1])[2] = 0; - ((u8*)&pip[1])[3] = 0; + ((u8 *)&pip[1])[0] = IPOPT_RA; + ((u8 *)&pip[1])[1] = 4; + ((u8 *)&pip[1])[2] = 0; + ((u8 *)&pip[1])[3] = 0; skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; skb_put(skb, sizeof(*pig)); @@ -688,10 +688,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(iph, &rt->dst, NULL); - ((u8*)&iph[1])[0] = IPOPT_RA; - ((u8*)&iph[1])[1] = 4; - ((u8*)&iph[1])[2] = 0; - ((u8*)&iph[1])[3] = 0; + ((u8 *)&iph[1])[0] = IPOPT_RA; + ((u8 *)&iph[1])[1] = 4; + ((u8 *)&iph[1])[2] = 0; + ((u8 *)&iph[1])[3] = 0; ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); ih->type = type; @@ -774,7 +774,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) - continue; + break; if (srcs[i] == psf->sf_inaddr) { scount++; break; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 19d66cefd7d3..c7a4de05ca04 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -42,7 +42,8 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports); void inet_get_local_port_range(int *low, int *high) { - unsigned seq; + unsigned int seq; + do { seq = read_seqbegin(&sysctl_local_ports.lock); @@ -53,7 +54,7 @@ void inet_get_local_port_range(int *low, int *high) EXPORT_SYMBOL(inet_get_local_port_range); int inet_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) + const struct inet_bind_bucket *tb, bool relax) { struct sock *sk2; struct hlist_node *node; @@ -79,6 +80,14 @@ int inet_csk_bind_conflict(const struct sock *sk, sk2_rcv_saddr == sk_rcv_saddr(sk)) break; } + if (!relax && reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN) { + const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); + + if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || + sk2_rcv_saddr == sk_rcv_saddr(sk)) + break; + } } } return node != NULL; @@ -122,12 +131,13 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && + !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = smallest_rover; goto tb_found; } } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = rover; goto tb_found; } @@ -172,18 +182,22 @@ have_snum: goto tb_not_found; tb_found: if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse == SK_FORCE_REUSE) + goto success; + if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN && smallest_size == -1) { goto success; } else { ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && smallest_size != -1 && --attempts >= 0) { spin_unlock(&head->lock); goto again; } + goto fail_unlock; } } @@ -354,16 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req) + const struct request_sock *req, + bool nocache) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); + int flags = inet_sk_flowi_flags(sk); + if (nocache) + flags |= FLOWI_FLAG_RT_NOCACHE; flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, - sk->sk_protocol, inet_sk_flowi_flags(sk), + sk->sk_protocol, + flags, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); @@ -514,7 +533,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. - * If synack was not acknowledged for 3 seconds, it means + * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old @@ -555,8 +574,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, syn_ack_recalc(req, thresh, max_retries, queue->rskq_defer_accept, &expire, &resend); - if (req->rsk_ops->syn_ack_timeout) - req->rsk_ops->syn_ack_timeout(parent, req); + req->rsk_ops->syn_ack_timeout(parent, req); if (!expire && (!resend || !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || @@ -785,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); #endif + +static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + fl4 = &fl->u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_protocol, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + if (IS_ERR(rt)) + rt = NULL; + if (rt) + sk_setup_caps(sk, &rt->dst); + rcu_read_unlock(); + + return &rt->dst; +} + +struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + struct inet_sock *inet = inet_sk(sk); + + if (!dst) { + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); + if (!dst) + goto out; + } + dst->ops->update_pmtu(dst, sk, NULL, mtu); + + dst = __sk_dst_check(sk, 0); + if (!dst) + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +out: + return dst; +} +EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8d25a1c557eb..570e61f9611f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -46,9 +46,6 @@ struct inet_diag_entry { u16 userlocks; }; -#define INET_DIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) - static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) @@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, const struct inet_sock *inet = inet_sk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; + struct nlattr *attr; void *info = NULL; - struct inet_diag_meminfo *minfo = NULL; - unsigned char *b = skb_tail_pointer(skb); const struct inet_diag_handler *handler; int ext = req->idiag_ext; handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(handler == NULL); - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = nlmsg_flags; + nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; - r = NLMSG_DATA(nlh); + r = nlmsg_data(nlh); BUG_ON(sk->sk_state == TCP_TIME_WAIT); - if (ext & (1 << (INET_DIAG_MEMINFO - 1))) - minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); - r->idiag_family = sk->sk_family; r->idiag_state = sk->sk_state; r->idiag_timer = 0; @@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, * hence this needs to be included regardless of socket family. */ if (ext & (1 << (INET_DIAG_TOS - 1))) - RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); + if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) + goto errout; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { @@ -121,27 +117,34 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; *(struct in6_addr *)r->id.idiag_dst = np->daddr; + if (ext & (1 << (INET_DIAG_TCLASS - 1))) - RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); + if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0) + goto errout; } #endif r->idiag_uid = sock_i_uid(sk); r->idiag_inode = sock_i_ino(sk); - if (minfo) { - minfo->idiag_rmem = sk_rmem_alloc_get(sk); - minfo->idiag_wmem = sk->sk_wmem_queued; - minfo->idiag_fmem = sk->sk_forward_alloc; - minfo->idiag_tmem = sk_wmem_alloc_get(sk); + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { + struct inet_diag_meminfo minfo = { + .idiag_rmem = sk_rmem_alloc_get(sk), + .idiag_wmem = sk->sk_wmem_queued, + .idiag_fmem = sk->sk_forward_alloc, + .idiag_tmem = sk_wmem_alloc_get(sk), + }; + + if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) + goto errout; } if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) - goto rtattr_failure; + goto errout; if (icsk == NULL) { - r->idiag_rqueue = r->idiag_wqueue = 0; + handler->idiag_get_info(sk, r, NULL); goto out; } @@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) - info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info)); - - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { - const size_t len = strlen(icsk->icsk_ca_ops->name); + if (ext & (1 << (INET_DIAG_INFO - 1))) { + attr = nla_reserve(skb, INET_DIAG_INFO, + sizeof(struct tcp_info)); + if (!attr) + goto errout; - strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), - icsk->icsk_ca_ops->name); + info = nla_data(attr); } + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) + if (nla_put_string(skb, INET_DIAG_CONG, + icsk->icsk_ca_ops->name) < 0) + goto errout; + handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT && @@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, icsk->icsk_ca_ops->get_info(sk, ext, skb); out: - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - return skb->len; + return nlmsg_end(skb, nlh); -rtattr_failure: -nlmsg_failure: - nlmsg_trim(skb, b); +errout: + nlmsg_cancel(skb, nlh); return -EMSGSIZE; } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); @@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, { long tmo; struct inet_diag_msg *r; - const unsigned char *previous_tail = skb_tail_pointer(skb); - struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, - unlh->nlmsg_type, sizeof(*r)); + struct nlmsghdr *nlh; - r = NLMSG_DATA(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); + nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; - nlh->nlmsg_flags = nlmsg_flags; + r = nlmsg_data(nlh); + BUG_ON(tw->tw_state != TCP_TIME_WAIT); tmo = tw->tw_ttd - jiffies; if (tmo < 0) @@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; } #endif - nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; - return skb->len; -nlmsg_failure: - nlmsg_trim(skb, previous_tail); - return -EMSGSIZE; + + return nlmsg_end(skb, nlh); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, @@ -269,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s int err; struct sock *sk; struct sk_buff *rep; + struct net *net = sock_net(in_skb->sk); err = -EINVAL; if (req->sdiag_family == AF_INET) { - sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); } #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { - sk = inet6_lookup(&init_net, hashinfo, + sk = inet6_lookup(net, hashinfo, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, @@ -298,23 +302,23 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64)), - GFP_KERNEL); - if (!rep) + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + + sizeof(struct tcp_info) + 64, GFP_KERNEL); + if (!rep) { + err = -ENOMEM; goto out; + } err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { WARN_ON(err == -EMSGSIZE); - kfree_skb(rep); + nlmsg_free(rep); goto out; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -592,15 +596,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb_tail_pointer(skb); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = NLM_F_MULTI; - r = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), + NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + r = nlmsg_data(nlh); r->idiag_family = sk->sk_family; r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; @@ -628,13 +633,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; } #endif - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - - return skb->len; -nlmsg_failure: - nlmsg_trim(skb, b); - return -1; + return nlmsg_end(skb, nlh); } static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, @@ -725,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, { int i, num; int s_i, s_num; + struct net *net = sock_net(skb->sk); s_i = cb->args[1]; s_num = num = cb->args[2]; @@ -744,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, sk_nulls_for_each(sk, node, &ilb->head) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { num++; continue; @@ -814,6 +818,8 @@ skip_listen_ht: sk_nulls_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -840,6 +846,8 @@ next_normal: inet_twsk_for_each(tw, node, &head->twchain) { + if (!net_eq(twsk_net(tw), net)) + continue; if (num < s_num) goto next_dying; @@ -892,7 +900,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_attrlen(cb->nlh, hdrlen)) bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc); + return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); } static inline int inet_diag_type2proto(int type) @@ -909,7 +917,7 @@ static inline int inet_diag_type2proto(int type) static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { - struct inet_diag_req *rc = NLMSG_DATA(cb->nlh); + struct inet_diag_req *rc = nlmsg_data(cb->nlh); struct inet_diag_req_v2 req; struct nlattr *bc = NULL; int hdrlen = sizeof(struct inet_diag_req); @@ -929,7 +937,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c static int inet_diag_get_exact_compat(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { - struct inet_diag_req *rc = NLMSG_DATA(nlh); + struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; req.sdiag_family = rc->idiag_family; @@ -944,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) { int hdrlen = sizeof(struct inet_diag_req); + struct net *net = sock_net(skb->sk); if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || nlmsg_len(nlh) < hdrlen) @@ -964,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) struct netlink_dump_control c = { .dump = inet_diag_dump_compat, }; - return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c); + return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } } @@ -974,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); + struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -992,19 +1002,19 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = inet_diag_dump, }; - return netlink_dump_start(sock_diag_nlsk, skb, h, &c); + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } - return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h)); + return inet_diag_get_exact(skb, h, nlmsg_data(h)); } -static struct sock_diag_handler inet_diag_handler = { +static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, .dump = inet_diag_handler_dump, }; -static struct sock_diag_handler inet6_diag_handler = { +static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, .dump = inet_diag_handler_dump, }; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ff2a51b6d0c..85190e69297b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (q == NULL) return NULL; + q->net = nf; f->constructor(q, arg); atomic_add(f->qsize, &nf->mem); setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - q->net = nf; return q; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 984ec656b03b..7880af970208 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -217,7 +217,7 @@ begin: } EXPORT_SYMBOL_GPL(__inet_lookup_listener); -struct sock * __inet_lookup_established(struct net *net, +struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 89168c6351ff..2784db3155fb 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -89,8 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, #ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", - tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + pr_debug("%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); } #endif while (refcnt) { @@ -263,7 +263,7 @@ rescan: void inet_twdr_hangman(unsigned long data) { struct inet_timewait_death_row *twdr; - int unsigned need_timer; + unsigned int need_timer; twdr = (struct inet_timewait_death_row *)data; spin_lock(&twdr->death_lock); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d4d61b694fab..e1e0a4e8fd34 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = { .avl_height = 0 }; -struct inet_peer_base { - struct inet_peer __rcu *root; - seqlock_t lock; - int total; -}; +void inet_peer_base_init(struct inet_peer_base *bp) +{ + bp->root = peer_avl_empty_rcu; + seqlock_init(&bp->lock); + bp->flush_seq = ~0U; + bp->total = 0; +} +EXPORT_SYMBOL_GPL(inet_peer_base_init); -static struct inet_peer_base v4_peers = { - .root = peer_avl_empty_rcu, - .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), - .total = 0, -}; +static atomic_t v4_seq = ATOMIC_INIT(0); +static atomic_t v6_seq = ATOMIC_INIT(0); -static struct inet_peer_base v6_peers = { - .root = peer_avl_empty_rcu, - .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), - .total = 0, -}; +static atomic_t *inetpeer_seq_ptr(int family) +{ + return (family == AF_INET ? &v4_seq : &v6_seq); +} + +static inline void flush_check(struct inet_peer_base *base, int family) +{ + atomic_t *fp = inetpeer_seq_ptr(family); + + if (unlikely(base->flush_seq != atomic_read(fp))) { + inetpeer_invalidate_tree(base); + base->flush_seq = atomic_read(fp); + } +} + +void inetpeer_invalidate_family(int family) +{ + atomic_t *fp = inetpeer_seq_ptr(family); + + atomic_inc(fp); +} #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ @@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min static void inetpeer_gc_worker(struct work_struct *work) { - struct inet_peer *p, *n; + struct inet_peer *p, *n, *c; LIST_HEAD(list); spin_lock_bh(&gc_lock); @@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work) list_for_each_entry_safe(p, n, &list, gc_list) { - if(need_resched()) + if (need_resched()) cond_resched(); - if (p->avl_left != peer_avl_empty) { - list_add_tail(&p->avl_left->gc_list, &list); - p->avl_left = peer_avl_empty; + c = rcu_dereference_protected(p->avl_left, 1); + if (c != peer_avl_empty) { + list_add_tail(&c->gc_list, &list); + p->avl_left = peer_avl_empty_rcu; } - if (p->avl_right != peer_avl_empty) { - list_add_tail(&p->avl_right->gc_list, &list); - p->avl_right = peer_avl_empty; + c = rcu_dereference_protected(p->avl_right, 1); + if (c != peer_avl_empty) { + list_add_tail(&c->gc_list, &list); + p->avl_right = peer_avl_empty_rcu; } n = list_entry(p->gc_list.next, struct inet_peer, gc_list); @@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, call_rcu(&p->rcu, inetpeer_free_rcu); } -static struct inet_peer_base *family_to_base(int family) -{ - return family == AF_INET ? &v4_peers : &v6_peers; -} - /* perform garbage collect on all items stacked during a lookup */ static int inet_peer_gc(struct inet_peer_base *base, struct inet_peer __rcu **stack[PEER_MAXDEPTH], @@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base, return cnt; } -struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) +struct inet_peer *inet_getpeer(struct inet_peer_base *base, + const struct inetpeer_addr *daddr, + int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; unsigned int sequence; int invalidated, gccnt = 0; + flush_check(base, daddr->family); + /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ @@ -492,13 +508,9 @@ relookup: (daddr->family == AF_INET) ? secure_ip_id(daddr->addr.a4) : secure_ipv6_id(daddr->addr.a6)); - p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->rate_last = 0; - p->pmtu_expires = 0; - p->pmtu_orig = 0; - memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ @@ -560,29 +572,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) } EXPORT_SYMBOL(inet_peer_xrlim_allow); -void inetpeer_invalidate_tree(int family) +static void inetpeer_inval_rcu(struct rcu_head *head) { - struct inet_peer *old, *new, *prev; - struct inet_peer_base *base = family_to_base(family); + struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); - write_seqlock_bh(&base->lock); + spin_lock_bh(&gc_lock); + list_add_tail(&p->gc_list, &gc_list); + spin_unlock_bh(&gc_lock); - old = base->root; - if (old == peer_avl_empty_rcu) - goto out; + schedule_delayed_work(&gc_work, gc_delay); +} - new = peer_avl_empty_rcu; +void inetpeer_invalidate_tree(struct inet_peer_base *base) +{ + struct inet_peer *root; + + write_seqlock_bh(&base->lock); - prev = cmpxchg(&base->root, old, new); - if (prev == old) { + root = rcu_deref_locked(base->root, base); + if (root != peer_avl_empty) { + base->root = peer_avl_empty_rcu; base->total = 0; - spin_lock(&gc_lock); - list_add_tail(&prev->gc_list, &gc_list); - spin_unlock(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + call_rcu(&root->gc_rcu, inetpeer_inval_rcu); } -out: write_sequnlock_bh(&base->lock); } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 29a07b6c7168..ab09b126423c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -41,9 +41,10 @@ static int ip_forward_finish(struct sk_buff *skb) { - struct ip_options * opt = &(IPCB(skb)->opt); + struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -55,7 +56,7 @@ int ip_forward(struct sk_buff *skb) { struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ - struct ip_options * opt = &(IPCB(skb)->opt); + struct ip_options *opt = &(IPCB(skb)->opt); if (skb_warn_if_lro(skb)) goto drop; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 3727e234c884..8d07c973409c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -148,17 +148,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q) return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static int ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(struct inet_frag_queue *q, void *a) { struct ipq *qp; struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user; + qp->saddr == arg->iph->saddr && + qp->daddr == arg->iph->daddr && + qp->protocol == arg->iph->protocol && + qp->user == arg->user; } /* Memory Tracking Functions. */ @@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); + struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, + frags); + struct net *net = container_of(ipv4, struct net, ipv4); + struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; @@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) @@ -545,6 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; + int sum_truesize; u8 ecn; ipq_kill(qp); @@ -569,7 +574,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, skb_morph(head, qp->q.fragments); head->next = qp->q.fragments->next; - kfree_skb(qp->q.fragments); + consume_skb(qp->q.fragments); qp->q.fragments = head; } @@ -611,19 +616,32 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, atomic_add(clone->truesize, &qp->q.net->mem); } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; + sum_truesize = head->truesize; + for (fp = head->next; fp;) { + bool headstolen; + int delta; + struct sk_buff *next = fp->next; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; + + if (skb_try_coalesce(head, fp, &headstolen, &delta)) { + kfree_skb_partial(fp, headstolen); + } else { + if (!skb_shinfo(head)->frag_list) + skb_shinfo(head)->frag_list = fp; + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + } + fp = next; } - atomic_sub(head->truesize, &qp->q.net->mem); + atomic_sub(sum_truesize, &qp->q.net->mem); head->next = NULL; head->dev = dev; @@ -644,8 +662,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - if (net_ratelimit()) - pr_info("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); return err; @@ -782,7 +799,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[2].data = &net->ipv4.frags.timeout; } - hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + hdr = register_net_sysctl(net, "net/ipv4", table); if (hdr == NULL) goto err_reg; @@ -807,7 +824,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) static void ip4_frags_ctl_register(void) { - register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); + register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static inline int ip4_frags_ns_ctl_register(struct net *net) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b57532d4742c..42c44b1403c9 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -169,37 +169,56 @@ struct ipgre_net { /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_tstats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long tx_packets; - unsigned long tx_bytes; -} __attribute__((aligned(4*sizeof(unsigned long)))); + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; +}; -static struct net_device_stats *ipgre_get_stats(struct net_device *dev) +static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) { - struct pcpu_tstats sum = { 0 }; int i; for_each_possible_cpu(i) { const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - - sum.rx_packets += tstats->rx_packets; - sum.rx_bytes += tstats->rx_bytes; - sum.tx_packets += tstats->tx_packets; - sum.tx_bytes += tstats->tx_bytes; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_bh(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; } - dev->stats.rx_packets = sum.rx_packets; - dev->stats.rx_bytes = sum.rx_bytes; - dev->stats.tx_packets = sum.tx_packets; - dev->stats.tx_bytes = sum.tx_bytes; - return &dev->stats; + + tot->multicast = dev->stats.multicast; + tot->rx_crc_errors = dev->stats.rx_crc_errors; + tot->rx_fifo_errors = dev->stats.rx_fifo_errors; + tot->rx_length_errors = dev->stats.rx_length_errors; + tot->rx_errors = dev->stats.rx_errors; + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + + return tot; } /* Given src, dst and key, find appropriate for input tunnel. */ -static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, - __be32 remote, __be32 local, - __be32 key, __be16 gre_proto) +static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, + __be32 remote, __be32 local, + __be32 key, __be16 gre_proto) { struct net *net = dev_net(dev); int link = dev->ifindex; @@ -464,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) */ const struct iphdr *iph = (const struct iphdr *)skb->data; - __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); + __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2)); int grehlen = (iph->ihl<<2) + 4; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; @@ -497,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info) case ICMP_PORT_UNREACH: /* Impossible event. */ return; - case ICMP_FRAG_NEEDED: - /* Soft state for pmtu is maintained by IP core. */ - return; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -512,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return; break; + + case ICMP_REDIRECT: + break; } rcu_read_lock(); @@ -519,7 +538,20 @@ static void ipgre_err(struct sk_buff *skb, u32 info) flags & GRE_KEY ? *(((__be32 *)p) + (grehlen / 4) - 1) : 0, p[1]); - if (t == NULL || t->parms.iph.daddr == 0 || + if (t == NULL) + goto out; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->parms.link, 0, IPPROTO_GRE, 0); + goto out; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, + IPPROTO_GRE, 0); + goto out; + } + if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) goto out; @@ -574,7 +606,7 @@ static int ipgre_rcv(struct sk_buff *skb) iph = ip_hdr(skb); h = skb->data; - flags = *(__be16*)h; + flags = *(__be16 *)h; if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { /* - Version must be 0. @@ -598,11 +630,11 @@ static int ipgre_rcv(struct sk_buff *skb) offset += 4; } if (flags&GRE_KEY) { - key = *(__be32*)(h + offset); + key = *(__be32 *)(h + offset); offset += 4; } if (flags&GRE_SEQ) { - seqno = ntohl(*(__be32*)(h + offset)); + seqno = ntohl(*(__be32 *)(h + offset)); offset += 4; } } @@ -672,8 +704,10 @@ static int ipgre_rcv(struct sk_buff *skb) } tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); tstats->rx_packets++; tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); __skb_tunnel_rx(skb, tunnel->dev); @@ -799,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); @@ -900,7 +934,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev htons(ETH_P_TEB) : skb->protocol; if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { - __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); + __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); if (tunnel->parms.o_flags&GRE_SEQ) { ++tunnel->o_seqno; @@ -913,7 +947,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } if (tunnel->parms.o_flags&GRE_CSUM) { *ptr = 0; - *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); + *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr)); } } @@ -1169,7 +1203,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); - __be16 *p = (__be16*)(iph+1); + __be16 *p = (__be16 *)(iph+1); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); p[0] = t->parms.o_flags; @@ -1253,7 +1287,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_start_xmit = ipgre_tunnel_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ipgre_tunnel_change_mtu, - .ndo_get_stats = ipgre_get_stats, + .ndo_get_stats64 = ipgre_get_stats64, }; static void ipgre_dev_free(struct net_device *dev) @@ -1507,7 +1541,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ipgre_tunnel_change_mtu, - .ndo_get_stats = ipgre_get_stats, + .ndo_get_stats64 = ipgre_get_stats64, }; static void ipgre_tap_setup(struct net_device *dev) @@ -1654,17 +1688,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm *p = &t->parms; - NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); - NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); - NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); - NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); - NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); - NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); - NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); - NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); - NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); - NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); - + if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || + nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || + nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || + nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || + nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || + nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || + nla_put_u8(skb, IFLA_GRE_PMTUDISC, + !!(p->iph.frag_off & htons(IP_DF)))) + goto nla_put_failure; return 0; nla_put_failure: diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 26eccc5bab1c..b27d4440f523 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -198,21 +198,19 @@ static int ip_local_deliver_finish(struct sk_buff *skb) rcu_read_lock(); { int protocol = ip_hdr(skb)->protocol; - int hash, raw; const struct net_protocol *ipprot; + int raw; resubmit: raw = raw_local_deliver(skb, protocol); - hash = protocol & (MAX_INET_PROTOS - 1); - ipprot = rcu_dereference(inet_protos[hash]); + ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot != NULL) { int ret; if (!net_eq(net, &init_net) && !ipprot->netns_ok) { - if (net_ratelimit()) - printk("%s: proto %d isn't netns-ready\n", - __func__, protocol); + net_info_ratelimited("%s: proto %d isn't netns-ready\n", + __func__, protocol); kfree_skb(skb); goto out; } @@ -298,10 +296,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb) if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { - if (IN_DEV_LOG_MARTIANS(in_dev) && - net_ratelimit()) - pr_info("source route option %pI4 -> %pI4\n", - &iph->saddr, &iph->daddr); + if (IN_DEV_LOG_MARTIANS(in_dev)) + net_info_ratelimited("source route option %pI4 -> %pI4\n", + &iph->saddr, + &iph->daddr); goto drop; } } @@ -315,26 +313,33 @@ drop: return true; } +int sysctl_ip_early_demux __read_mostly = 1; + static int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + if (sysctl_ip_early_demux && !skb_dst(skb)) { + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->early_demux) + ipprot->early_demux(skb); + rcu_read_unlock(); + } + /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb_dst(skb) == NULL) { + if (!skb_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { - if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INADDRERRORS); - else if (err == -ENETUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INNOROUTES); - else if (err == -EXDEV) + if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), LINUX_MIB_IPRPFILTER); goto drop; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index a0d0d9d9b870..1dc01f9793d5 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -27,6 +27,7 @@ #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> +#include <net/ip_fib.h> /* * Write options to IP header, record destination address to @@ -92,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) unsigned char *sptr, *dptr; int soffset, doffset; int optlen; - __be32 daddr; memset(dopt, 0, sizeof(struct ip_options)); @@ -104,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) sptr = skb_network_header(skb); dptr = dopt->__data; - daddr = skb_rtable(skb)->rt_spec_dst; - if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; @@ -179,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) doffset -= 4; } if (doffset > 3) { + __be32 daddr = fib_compute_spec_dst(skb); + memcpy(&start[doffset-1], &daddr, 4); dopt->faddr = faddr; dptr[0] = start[0]; @@ -210,10 +210,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) * Simple and stupid 8), but the most efficient way. */ -void ip_options_fragment(struct sk_buff * skb) +void ip_options_fragment(struct sk_buff *skb) { unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); - struct ip_options * opt = &(IPCB(skb)->opt); + struct ip_options *opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; @@ -241,6 +241,15 @@ void ip_options_fragment(struct sk_buff * skb) opt->ts_needtime = 0; } +/* helper used by ip_options_compile() to call fib_compute_spec_dst() + * at most one time. + */ +static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) +{ + if (*spec_dst == htonl(INADDR_ANY)) + *spec_dst = fib_compute_spec_dst(skb); +} + /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. @@ -248,14 +257,14 @@ void ip_options_fragment(struct sk_buff * skb) */ int ip_options_compile(struct net *net, - struct ip_options * opt, struct sk_buff * skb) + struct ip_options *opt, struct sk_buff *skb) { - int l; - unsigned char * iph; - unsigned char * optptr; - int optlen; - unsigned char * pp_ptr = NULL; + __be32 spec_dst = htonl(INADDR_ANY); + unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; + unsigned char *optptr; + unsigned char *iph; + int optlen, l; if (skb != NULL) { rt = skb_rtable(skb); @@ -331,7 +340,8 @@ int ip_options_compile(struct net *net, goto error; } if (rt) { - memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + spec_dst_fill(&spec_dst, skb); + memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; @@ -373,7 +383,8 @@ int ip_options_compile(struct net *net, } opt->ts = optptr - iph; if (rt) { - memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + spec_dst_fill(&spec_dst, skb); + memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; @@ -413,7 +424,7 @@ int ip_options_compile(struct net *net, opt->is_changed = 1; } } else { - unsigned overflow = optptr[3]>>4; + unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; @@ -473,20 +484,20 @@ EXPORT_SYMBOL(ip_options_compile); * Undo all the changes done by ip_options_compile(). */ -void ip_options_undo(struct ip_options * opt) +void ip_options_undo(struct ip_options *opt) { if (opt->srr) { - unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); + unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); memmove(optptr+7, optptr+3, optptr[1]-7); memcpy(optptr+3, &opt->faddr, 4); } if (opt->rr_needaddr) { - unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); + unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); optptr[2] -= 4; memset(&optptr[optptr[2]-1], 0, 4); } if (opt->ts) { - unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); + unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); if (opt->ts_needtime) { optptr[2] -= 4; memset(&optptr[optptr[2]-1], 0, 4); @@ -549,8 +560,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp, void ip_forward_options(struct sk_buff *skb) { - struct ip_options * opt = &(IPCB(skb)->opt); - unsigned char * optptr; + struct ip_options *opt = &(IPCB(skb)->opt); + unsigned char *optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); @@ -578,8 +589,10 @@ void ip_forward_options(struct sk_buff *skb) ip_hdr(skb)->daddr = opt->nexthop; ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; - } else if (net_ratelimit()) - pr_crit("%s(): Argh! Destination lost!\n", __func__); + } else { + net_crit_ratelimited("%s(): Argh! Destination lost!\n", + __func__); + } if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4910176d24ed..c528f841ca4b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ip_local_out); -/* dev_loopback_xmit for use with netfilter. */ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - skb_dst_force(newskb); - netif_rx_ni(newskb); - return 0; -} - static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; @@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; + u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -200,22 +188,25 @@ static inline int ip_finish_output2(struct sk_buff *skb) } if (skb->sk) skb_set_owner_w(skb2, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = skb2; } - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); + rcu_read_lock_bh(); + nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; + neigh = __ipv4_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (neigh) { - int res = neigh_output(neigh, skb); + int res = dst_neigh_output(dst, neigh, skb); - rcu_read_unlock(); + rcu_read_unlock_bh(); return res; } - rcu_read_unlock(); + rcu_read_unlock_bh(); - if (net_ratelimit()) - printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); + net_dbg_ratelimited("%s: No header cache and no neighbour!\n", + __func__); kfree_skb(skb); return -EINVAL; } @@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, - ip_dev_loopback_xmit); + dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, ip_dev_loopback_xmit); + NULL, newskb->dev, dev_loopback_xmit); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, @@ -709,7 +700,7 @@ slow_path: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } - kfree_skb(skb); + consume_skb(skb); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; @@ -1472,19 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. + * Used to send some TCP resets/acks so far. * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. + * Use a fake percpu inet socket to avoid false sharing and contention. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - const struct ip_reply_arg *arg, unsigned int len) +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { + .sk = { + .__sk_common = { + .skc_refcnt = ATOMIC_INIT(1), + }, + .sk_wmem_alloc = ATOMIC_INIT(1), + .sk_allocation = GFP_ATOMIC, + .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + }, + .pmtudisc = IP_PMTUDISC_WANT, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len) { - struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct sk_buff *nskb; + struct sock *sk; + struct inet_sock *inet; if (ip_options_echo(&replyopts.opt.opt, skb)) return; @@ -1502,38 +1507,39 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, flowi4_init_output(&fl4, arg->bound_dev_if, 0, RT_TOS(arg->tos), - RT_SCOPE_UNIVERSE, sk->sk_protocol, + RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), - daddr, rt->rt_spec_dst, + daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); + rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; - /* And let IP do all the hard work. + inet = &get_cpu_var(unicast_sock); - This chunk is not reenterable, hence spinlock. - Note that it uses the fact, that this function is called - with locally disabled BH and that sk cannot be already spinlocked. - */ - bh_lock_sock(sk); inet->tos = arg->tos; + sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; + sock_net_set(sk, net); + __skb_queue_head_init(&sk->sk_write_queue); + sk->sk_sndbuf = sysctl_wmem_default; ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + nskb = skb_peek(&sk->sk_write_queue); + if (nskb) { if (arg->csumoffset >= 0) - *((__sum16 *)skb_transport_header(skb) + - arg->csumoffset) = csum_fold(csum_add(skb->csum, + *((__sum16 *)skb_transport_header(nskb) + + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); - skb->ip_summed = CHECKSUM_NONE; + nskb->ip_summed = CHECKSUM_NONE; + skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } - bh_unlock_sock(sk); + put_cpu_var(unicast_sock); ip_rt_put(rt); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2fd0fba77124..de29f46f68b0 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -40,6 +40,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/transp_v6.h> #endif +#include <net/ip_fib.h> #include <linux/errqueue.h> #include <asm/uaccess.h> @@ -90,7 +91,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; - struct ip_options * opt = (struct ip_options *)optbuf; + struct ip_options *opt = (struct ip_options *)optbuf; if (IPCB(skb)->opt.optlen == 0) return; @@ -147,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(skb->sk); - unsigned flags = inet->cmsg_flags; + unsigned int flags = inet->cmsg_flags; /* Ordered by supposed usage frequency */ if (flags & 1) @@ -673,10 +674,15 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } else { memset(&mreq, 0, sizeof(mreq)); - if (optlen >= sizeof(struct in_addr) && - copy_from_user(&mreq.imr_address, optval, - sizeof(struct in_addr))) - break; + if (optlen >= sizeof(struct ip_mreq)) { + if (copy_from_user(&mreq, optval, + sizeof(struct ip_mreq))) + break; + } else if (optlen >= sizeof(struct in_addr)) { + if (copy_from_user(&mreq.imr_address, optval, + sizeof(struct in_addr))) + break; + } } if (!mreq.imr_ifindex) { @@ -1014,8 +1020,8 @@ e_inval: * @sk: socket * @skb: buffer * - * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst - * in skb->cb[] before dst drop. + * To support IP_CMSG_PKTINFO option, we store rt_iif and specific + * destination in skb->cb[] before dst drop. * This way, receiver doesnt make cache line misses to read rtable. */ void ipv4_pktinfo_prepare(struct sk_buff *skb) @@ -1025,7 +1031,7 @@ void ipv4_pktinfo_prepare(struct sk_buff *skb) if (rt) { pktinfo->ipi_ifindex = rt->rt_iif; - pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; + pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; @@ -1094,7 +1100,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt); */ static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen, unsigned flags) + char __user *optval, int __user *optlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); int val; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c new file mode 100644 index 000000000000..c41b5c359936 --- /dev/null +++ b/net/ipv4/ip_vti.c @@ -0,0 +1,956 @@ +/* + * Linux NET3: IP/IP protocol decoder modified to support + * virtual tunnel interface + * + * Authors: + * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* + This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/ipip.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#define HASH_SIZE 16 +#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1)) + +static struct rtnl_link_ops vti_link_ops __read_mostly; + +static int vti_net_id __read_mostly; +struct vti_net { + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel **tunnels[4]; + + struct net_device *fb_tunnel_dev; +}; + +static int vti_fb_tunnel_init(struct net_device *dev); +static int vti_tunnel_init(struct net_device *dev); +static void vti_tunnel_setup(struct net_device *dev); +static void vti_dev_free(struct net_device *dev); +static int vti_tunnel_bind_dev(struct net_device *dev); + +/* Locking : hash tables are protected by RCU and RTNL */ + +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; +}; + +#define VTI_XMIT(stats1, stats2) do { \ + int err; \ + int pkt_len = skb->len; \ + err = dst_output(skb); \ + if (net_xmit_eval(err) == 0) { \ + u64_stats_update_begin(&(stats1)->syncp); \ + (stats1)->tx_bytes += pkt_len; \ + (stats1)->tx_packets++; \ + u64_stats_update_end(&(stats1)->syncp); \ + } else { \ + (stats2)->tx_errors++; \ + (stats2)->tx_aborted_errors++; \ + } \ +} while (0) + + +static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_bh(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; + } + + tot->multicast = dev->stats.multicast; + tot->rx_crc_errors = dev->stats.rx_crc_errors; + tot->rx_fifo_errors = dev->stats.rx_fifo_errors; + tot->rx_length_errors = dev->stats.rx_length_errors; + tot->rx_errors = dev->stats.rx_errors; + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + + return tot; +} + +static struct ip_tunnel *vti_tunnel_lookup(struct net *net, + __be32 remote, __be32 local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + struct vti_net *ipn = net_generic(net, vti_net_id); + + for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + + for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + + for_each_ip_tunnel_rcu(ipn->tunnels_wc[0]) + if (t && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel **__vti_bucket(struct vti_net *ipn, + struct ip_tunnel_parm *parms) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &ipn->tunnels[prio][h]; +} + +static inline struct ip_tunnel **vti_bucket(struct vti_net *ipn, + struct ip_tunnel *t) +{ + return __vti_bucket(ipn, &t->parms); +} + +static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = vti_bucket(ipn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp = vti_bucket(ipn, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static struct ip_tunnel *vti_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, + int create) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; + struct net_device *dev; + char name[IFNAMSIZ]; + struct vti_net *ipn = net_generic(net, vti_net_id); + + for (tp = __vti_bucket(ipn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "vti%d"); + + dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + nt->parms = *parms; + dev->rtnl_link_ops = &vti_link_ops; + + vti_tunnel_bind_dev(dev); + + if (register_netdevice(dev) < 0) + goto failed_free; + + dev_hold(dev); + vti_tunnel_link(ipn, nt); + return nt; + +failed_free: + free_netdev(dev); + return NULL; +} + +static void vti_tunnel_uninit(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct vti_net *ipn = net_generic(net, vti_net_id); + + vti_tunnel_unlink(ipn, netdev_priv(dev)); + dev_put(dev); +} + +static int vti_err(struct sk_buff *skb, u32 info) +{ + + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr *)skb->data; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return 0; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return 0; + default: + /* All others are translated to HOST_UNREACH. */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return 0; + break; + } + + err = -ENOENT; + + rcu_read_lock(); + t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); + if (t == NULL) + goto out; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->parms.link, 0, IPPROTO_IPIP, 0); + err = 0; + goto out; + } + + err = 0; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + rcu_read_unlock(); + return err; +} + +/* We dont digest the packet therefore let the packet pass */ +static int vti_rcv(struct sk_buff *skb) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + + rcu_read_lock(); + tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + skb->dev = tunnel->dev; + rcu_read_unlock(); + return 1; + } + rcu_read_unlock(); + + return -1; +} + +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ + +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct pcpu_tstats *tstats; + struct iphdr *tiph = &tunnel->parms.iph; + u8 tos; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = ip_hdr(skb); + __be32 dst = tiph->daddr; + struct flowi4 fl4; + + if (skb->protocol != htons(ETH_P_IP)) + goto tx_error; + + tos = old_iph->tos; + + memset(&fl4, 0, sizeof(fl4)); + flowi4_init_output(&fl4, tunnel->parms.link, + htonl(tunnel->parms.i_key), RT_TOS(tos), + RT_SCOPE_UNIVERSE, + IPPROTO_IPIP, 0, + dst, tiph->saddr, 0, 0); + rt = ip_route_output_key(dev_net(dev), &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. + */ + if (!rt->dst.xfrm || + rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + nf_reset(skb); + skb->dev = skb_dst(skb)->dev; + + tstats = this_cpu_ptr(dev->tstats); + VTI_XMIT(tstats, &dev->stats); + return NETDEV_TX_OK; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int vti_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct rtable *rt; + struct flowi4 fl4; + memset(&fl4, 0, sizeof(fl4)); + flowi4_init_output(&fl4, tunnel->parms.link, + htonl(tunnel->parms.i_key), + RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, + IPPROTO_IPIP, 0, + iph->daddr, iph->saddr, 0, 0); + rt = ip_route_output_key(dev_net(dev), &fl4); + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + + sizeof(struct iphdr); + dev->mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + return dev->mtu; +} + +static int +vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct vti_net *ipn = net_generic(net, vti_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipn->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, + sizeof(p))) { + err = -EFAULT; + break; + } + t = vti_tunnel_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + memcpy(&p, &t->parms, sizeof(p)); + p.i_flags |= GRE_KEY | VTI_ISVTI; + p.o_flags |= GRE_KEY; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5) + goto done; + + t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); + + if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && + !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && + p.iph.daddr)) { + err = -EINVAL; + break; + } + t = netdev_priv(dev); + vti_tunnel_unlink(ipn, t); + synchronize_net(); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + t->parms.iph.protocol = IPPROTO_IPIP; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + vti_tunnel_link(ipn, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + if (t->parms.link != p.link) { + t->parms.link = p.link; + vti_tunnel_bind_dev(dev); + netdev_state_change(dev); + } + } + p.i_flags |= GRE_KEY | VTI_ISVTI; + p.o_flags |= GRE_KEY; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, + sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipn->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, + sizeof(p))) + goto done; + err = -ENOENT; + + t = vti_tunnel_locate(net, &p, 0); + if (t == NULL) + goto done; + err = -EPERM; + if (t->dev == ipn->fb_tunnel_dev) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops vti_netdev_ops = { + .ndo_init = vti_tunnel_init, + .ndo_uninit = vti_tunnel_uninit, + .ndo_start_xmit = vti_tunnel_xmit, + .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_change_mtu = vti_tunnel_change_mtu, + .ndo_get_stats64 = vti_get_stats64, +}; + +static void vti_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void vti_tunnel_setup(struct net_device *dev) +{ + dev->netdev_ops = &vti_netdev_ops; + dev->destructor = vti_dev_free; + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static int vti_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static int __net_init vti_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + dev_hold(dev); + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; +} + +static struct xfrm_tunnel vti_handler __read_mostly = { + .handler = vti_rcv, + .err_handler = vti_err, + .priority = 1, +}; + +static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + + t = rtnl_dereference(ipn->tunnels[prio][h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init vti_init_net(struct net *net) +{ + int err; + struct vti_net *ipn = net_generic(net, vti_net_id); + + ipn->tunnels[0] = ipn->tunnels_wc; + ipn->tunnels[1] = ipn->tunnels_l; + ipn->tunnels[2] = ipn->tunnels_r; + ipn->tunnels[3] = ipn->tunnels_r_l; + + ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), + "ip_vti0", + vti_tunnel_setup); + if (!ipn->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(ipn->fb_tunnel_dev, net); + + err = vti_fb_tunnel_init(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; + ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops; + + err = register_netdev(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; + return 0; + +err_reg_dev: + vti_dev_free(ipn->fb_tunnel_dev); +err_alloc_dev: + /* nothing */ + return err; +} + +static void __net_exit vti_exit_net(struct net *net) +{ + struct vti_net *ipn = net_generic(net, vti_net_id); + LIST_HEAD(list); + + rtnl_lock(); + vti_destroy_tunnels(ipn, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations vti_net_ops = { + .init = vti_init_net, + .exit = vti_exit_net, + .id = &vti_net_id, + .size = sizeof(struct vti_net), +}; + +static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + return 0; +} + +static void vti_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.protocol = IPPROTO_IPIP; + + if (!data) + return; + + if (data[IFLA_VTI_LINK]) + parms->link = nla_get_u32(data[IFLA_VTI_LINK]); + + if (data[IFLA_VTI_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); + + if (data[IFLA_VTI_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); + + if (data[IFLA_VTI_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); + + if (data[IFLA_VTI_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); + +} + +static int vti_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct vti_net *ipn = net_generic(net, vti_net_id); + int mtu; + int err; + + nt = netdev_priv(dev); + vti_netlink_parms(data, &nt->parms); + + if (vti_tunnel_locate(net, &nt->parms, 0)) + return -EEXIST; + + mtu = vti_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + err = register_netdevice(dev); + if (err) + goto out; + + dev_hold(dev); + vti_tunnel_link(ipn, nt); + +out: + return err; +} + +static int vti_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *t, *nt; + struct net *net = dev_net(dev); + struct vti_net *ipn = net_generic(net, vti_net_id); + struct ip_tunnel_parm p; + int mtu; + + if (dev == ipn->fb_tunnel_dev) + return -EINVAL; + + nt = netdev_priv(dev); + vti_netlink_parms(data, &p); + + t = vti_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + t = nt; + + vti_tunnel_unlink(ipn, t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + } + vti_tunnel_link(ipn, t); + netdev_state_change(dev); + } + + if (t->parms.link != p.link) { + t->parms.link = p.link; + mtu = vti_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + netdev_state_change(dev); + } + + return 0; +} + +static size_t vti_get_size(const struct net_device *dev) +{ + return + /* IFLA_VTI_LINK */ + nla_total_size(4) + + /* IFLA_VTI_IKEY */ + nla_total_size(4) + + /* IFLA_VTI_OKEY */ + nla_total_size(4) + + /* IFLA_VTI_LOCAL */ + nla_total_size(4) + + /* IFLA_VTI_REMOTE */ + nla_total_size(4) + + 0; +} + +static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm *p = &t->parms; + + nla_put_u32(skb, IFLA_VTI_LINK, p->link); + nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); + nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); + nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); + nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); + + return 0; +} + +static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { + [IFLA_VTI_LINK] = { .type = NLA_U32 }, + [IFLA_VTI_IKEY] = { .type = NLA_U32 }, + [IFLA_VTI_OKEY] = { .type = NLA_U32 }, + [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, +}; + +static struct rtnl_link_ops vti_link_ops __read_mostly = { + .kind = "vti", + .maxtype = IFLA_VTI_MAX, + .policy = vti_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = vti_tunnel_setup, + .validate = vti_tunnel_validate, + .newlink = vti_newlink, + .changelink = vti_changelink, + .get_size = vti_get_size, + .fill_info = vti_fill_info, +}; + +static int __init vti_init(void) +{ + int err; + + pr_info("IPv4 over IPSec tunneling driver\n"); + + err = register_pernet_device(&vti_net_ops); + if (err < 0) + return err; + err = xfrm4_mode_tunnel_input_register(&vti_handler); + if (err < 0) { + unregister_pernet_device(&vti_net_ops); + pr_info(KERN_INFO "vti init: can't register tunnel\n"); + } + + err = rtnl_link_register(&vti_link_ops); + if (err < 0) + goto rtnl_link_failed; + + return err; + +rtnl_link_failed: + xfrm4_mode_tunnel_input_deregister(&vti_handler); + unregister_pernet_device(&vti_net_ops); + return err; +} + +static void __exit vti_fini(void) +{ + rtnl_link_unregister(&vti_link_ops); + if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) + pr_info("vti close: can't deregister tunnel\n"); + + unregister_pernet_device(&vti_net_ops); +} + +module_init(vti_init); +module_exit(vti_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("vti"); +MODULE_ALIAS_NETDEV("ip_vti0"); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 63b64c45a826..d3ab47e19a89 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -31,17 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", - spi, &iph->daddr); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 92ac7e7363a0..67e8a6b086ea 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -808,8 +808,6 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d b->op = BOOTP_REQUEST; if (dev->type < 256) /* check for false types */ b->htype = dev->type; - else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ - b->htype = ARPHRD_IEEE802; else if (dev->type == ARPHRD_FDDI) b->htype = ARPHRD_ETHER; else { @@ -955,8 +953,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str /* Fragments are not supported */ if (ip_is_fragment(h)) { - if (net_ratelimit()) - pr_err("DHCP/BOOTP: Ignoring fragmented reply\n"); + net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n"); goto drop; } @@ -1004,16 +1001,14 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str /* Is it a reply to our BOOTP request? */ if (b->op != BOOTP_REPLY || b->xid != d->xid) { - if (net_ratelimit()) - pr_err("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", - b->op, b->xid); + net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", + b->op, b->xid); goto drop_unlock; } /* Is it a reply for the device we are configuring? */ if (b->xid != ic_dev_xid) { - if (net_ratelimit()) - pr_err("DHCP/BOOTP: Ignoring delayed packet\n"); + net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n"); goto drop_unlock; } @@ -1198,7 +1193,7 @@ static int __init ic_dynamic(void) d = ic_first_dev; retries = CONF_SEND_RETRIES; get_random_bytes(&timeout, sizeof(timeout)); - timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); + timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); for (;;) { /* Track the device we are configuring */ ic_dev_xid = d->xid; @@ -1626,11 +1621,13 @@ static int __init ip_auto_config_setup(char *addrs) return 1; } +__setup("ip=", ip_auto_config_setup); static int __init nfsaddrs_config_setup(char *addrs) { return ip_auto_config_setup(addrs); } +__setup("nfsaddrs=", nfsaddrs_config_setup); static int __init vendor_class_identifier_setup(char *addrs) { @@ -1641,7 +1638,4 @@ static int __init vendor_class_identifier_setup(char *addrs) vendor_class_identifier); return 1; } - -__setup("ip=", ip_auto_config_setup); -__setup("nfsaddrs=", nfsaddrs_config_setup); __setup("dhcpclass=", vendor_class_identifier_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ae1413e3f2f8..2c2c35bace76 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -144,33 +144,48 @@ static void ipip_dev_free(struct net_device *dev); /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_tstats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long tx_packets; - unsigned long tx_bytes; -} __attribute__((aligned(4*sizeof(unsigned long)))); + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; +}; -static struct net_device_stats *ipip_get_stats(struct net_device *dev) +static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) { - struct pcpu_tstats sum = { 0 }; int i; for_each_possible_cpu(i) { const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - - sum.rx_packets += tstats->rx_packets; - sum.rx_bytes += tstats->rx_bytes; - sum.tx_packets += tstats->tx_packets; - sum.tx_bytes += tstats->tx_bytes; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_bh(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; } - dev->stats.rx_packets = sum.rx_packets; - dev->stats.rx_bytes = sum.rx_bytes; - dev->stats.tx_packets = sum.tx_packets; - dev->stats.tx_bytes = sum.tx_bytes; - return &dev->stats; + + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + tot->collisions = dev->stats.collisions; + + return tot; } -static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, +static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { unsigned int h0 = HASH(remote); @@ -245,7 +260,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) rcu_assign_pointer(*tp, t); } -static struct ip_tunnel * ipip_tunnel_locate(struct net *net, +static struct ip_tunnel *ipip_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; @@ -333,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; - case ICMP_FRAG_NEEDED: - /* Soft state for pmtu is maintained by IP core. */ - return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -348,13 +360,32 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return 0; break; + case ICMP_REDIRECT: + break; } err = -ENOENT; rcu_read_lock(); t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); - if (t == NULL || t->parms.iph.daddr == 0) + if (t == NULL) + goto out; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->dev->ifindex, 0, IPPROTO_IPIP, 0); + err = 0; + goto out; + } + + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + IPPROTO_IPIP, 0); + err = 0; + goto out; + } + + if (t->parms.iph.daddr == 0) goto out; err = 0; @@ -404,8 +435,10 @@ static int ipip_rcv(struct sk_buff *skb) skb->pkt_type = PACKET_HOST; tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); tstats->rx_packets++; tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); __skb_tunnel_rx(skb, tunnel->dev); @@ -486,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { @@ -730,7 +763,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, - .ndo_get_stats = ipip_get_stats, + .ndo_get_stats64 = ipip_get_stats64, }; static void ipip_dev_free(struct net_device *dev) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 960fbfc3e976..5716c6b808d6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -524,8 +524,8 @@ failure: } #endif -/* - * Delete a VIF entry +/** + * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call */ @@ -949,8 +949,7 @@ static int ipmr_cache_report(struct mr_table *mrt, ret = sock_queue_rcv_skb(mroute_sk, skb); rcu_read_unlock(); if (ret < 0) { - if (net_ratelimit()) - pr_warn("mroute: pending queue full, dropping entries\n"); + net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); kfree_skb(skb); } @@ -1575,6 +1574,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -2006,37 +2006,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, { int ct; struct rtnexthop *nhp; - u8 *b = skb_tail_pointer(skb); - struct rtattr *mp_head; + struct nlattr *mp_attr; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mfc_parent >= MAXVIFS) return -ENOENT; - if (VIF_EXISTS(mrt, c->mfc_parent)) - RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); + if (VIF_EXISTS(mrt, c->mfc_parent) && + nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + return -EMSGSIZE; - mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) + return -EMSGSIZE; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) - goto rtattr_failure; - nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } - mp_head->rta_type = RTA_MULTIPATH; - mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + + nla_nest_end(skb, mp_attr); + rtm->rtm_type = RTN_MULTICAST; return 1; - -rtattr_failure: - nlmsg_trim(skb, b); - return -EMSGSIZE; } int ipmr_get_route(struct net *net, struct sk_buff *skb, @@ -2119,15 +2119,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, rtm->rtm_src_len = 32; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; - NLA_PUT_U32(skb, RTA_TABLE, mrt->id); + if (nla_put_u32(skb, RTA_TABLE, mrt->id)) + goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = RTPROT_UNSPEC; rtm->rtm_flags = 0; - NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); - NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); - + if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || + nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) + goto nla_put_failure; if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) goto nla_put_failure; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 4f47e064e262..ed1b36783192 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -12,7 +12,7 @@ #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) +int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) { struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); @@ -237,13 +237,3 @@ static void ipv4_netfilter_fini(void) module_init(ipv4_netfilter_init); module_exit(ipv4_netfilter_fini); - -#ifdef CONFIG_SYSCTL -struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "netfilter", }, - { } -}; -EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); -#endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 240b68469a7a..c20674dc9452 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -66,6 +66,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o # just filtering instance of ARP tables for now obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o - -obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o - diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index fd7a3f68917f..97e61eadf580 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -221,9 +221,8 @@ static inline int arp_checkentry(const struct arpt_arp *arp) static unsigned int arpt_error(struct sk_buff *skb, const struct xt_action_param *par) { - if (net_ratelimit()) - pr_err("arp_tables: error: '%s'\n", - (const char *)par->targinfo); + net_err_ratelimited("arp_tables: error: '%s'\n", + (const char *)par->targinfo); return NF_DROP; } @@ -303,7 +302,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { - verdict = (unsigned)(-v) - 1; + verdict = (unsigned int)(-v) - 1; break; } e = back; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c deleted file mode 100644 index 94d45e1f8882..000000000000 --- a/net/ipv4/netfilter/ip_queue.c +++ /dev/null @@ -1,639 +0,0 @@ -/* - * This is a module which is used for queueing IPv4 packets and - * communicating with userspace via netlink. - * - * (C) 2000-2002 James Morris <jmorris@intercode.com.au> - * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_queue.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netlink.h> -#include <linux/spinlock.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/security.h> -#include <linux/net.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/route.h> -#include <net/netfilter/nf_queue.h> -#include <net/ip.h> - -#define IPQ_QMAX_DEFAULT 1024 -#define IPQ_PROC_FS_NAME "ip_queue" -#define NET_IPQ_QMAX 2088 -#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" - -typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); - -static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; -static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; -static DEFINE_SPINLOCK(queue_lock); -static int peer_pid __read_mostly; -static unsigned int copy_range __read_mostly; -static unsigned int queue_total; -static unsigned int queue_dropped = 0; -static unsigned int queue_user_dropped = 0; -static struct sock *ipqnl __read_mostly; -static LIST_HEAD(queue_list); -static DEFINE_MUTEX(ipqnl_mutex); - -static inline void -__ipq_enqueue_entry(struct nf_queue_entry *entry) -{ - list_add_tail(&entry->list, &queue_list); - queue_total++; -} - -static inline int -__ipq_set_mode(unsigned char mode, unsigned int range) -{ - int status = 0; - - switch(mode) { - case IPQ_COPY_NONE: - case IPQ_COPY_META: - copy_mode = mode; - copy_range = 0; - break; - - case IPQ_COPY_PACKET: - if (range > 0xFFFF) - range = 0xFFFF; - copy_range = range; - copy_mode = mode; - break; - - default: - status = -EINVAL; - - } - return status; -} - -static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); - -static inline void -__ipq_reset(void) -{ - peer_pid = 0; - net_disable_timestamp(); - __ipq_set_mode(IPQ_COPY_NONE, 0); - __ipq_flush(NULL, 0); -} - -static struct nf_queue_entry * -ipq_find_dequeue_entry(unsigned long id) -{ - struct nf_queue_entry *entry = NULL, *i; - - spin_lock_bh(&queue_lock); - - list_for_each_entry(i, &queue_list, list) { - if ((unsigned long)i == id) { - entry = i; - break; - } - } - - if (entry) { - list_del(&entry->list); - queue_total--; - } - - spin_unlock_bh(&queue_lock); - return entry; -} - -static void -__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ - struct nf_queue_entry *entry, *next; - - list_for_each_entry_safe(entry, next, &queue_list, list) { - if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue_total--; - nf_reinject(entry, NF_DROP); - } - } -} - -static void -ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ - spin_lock_bh(&queue_lock); - __ipq_flush(cmpfn, data); - spin_unlock_bh(&queue_lock); -} - -static struct sk_buff * -ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) -{ - sk_buff_data_t old_tail; - size_t size = 0; - size_t data_len = 0; - struct sk_buff *skb; - struct ipq_packet_msg *pmsg; - struct nlmsghdr *nlh; - struct timeval tv; - - switch (ACCESS_ONCE(copy_mode)) { - case IPQ_COPY_META: - case IPQ_COPY_NONE: - size = NLMSG_SPACE(sizeof(*pmsg)); - break; - - case IPQ_COPY_PACKET: - if (entry->skb->ip_summed == CHECKSUM_PARTIAL && - (*errp = skb_checksum_help(entry->skb))) - return NULL; - - data_len = ACCESS_ONCE(copy_range); - if (data_len == 0 || data_len > entry->skb->len) - data_len = entry->skb->len; - - size = NLMSG_SPACE(sizeof(*pmsg) + data_len); - break; - - default: - *errp = -EINVAL; - return NULL; - } - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - goto nlmsg_failure; - - old_tail = skb->tail; - nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); - pmsg = NLMSG_DATA(nlh); - memset(pmsg, 0, sizeof(*pmsg)); - - pmsg->packet_id = (unsigned long )entry; - pmsg->data_len = data_len; - tv = ktime_to_timeval(entry->skb->tstamp); - pmsg->timestamp_sec = tv.tv_sec; - pmsg->timestamp_usec = tv.tv_usec; - pmsg->mark = entry->skb->mark; - pmsg->hook = entry->hook; - pmsg->hw_protocol = entry->skb->protocol; - - if (entry->indev) - strcpy(pmsg->indev_name, entry->indev->name); - else - pmsg->indev_name[0] = '\0'; - - if (entry->outdev) - strcpy(pmsg->outdev_name, entry->outdev->name); - else - pmsg->outdev_name[0] = '\0'; - - if (entry->indev && entry->skb->dev && - entry->skb->mac_header != entry->skb->network_header) { - pmsg->hw_type = entry->skb->dev->type; - pmsg->hw_addrlen = dev_parse_header(entry->skb, - pmsg->hw_addr); - } - - if (data_len) - if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) - BUG(); - - nlh->nlmsg_len = skb->tail - old_tail; - return skb; - -nlmsg_failure: - kfree_skb(skb); - *errp = -EINVAL; - printk(KERN_ERR "ip_queue: error creating packet message\n"); - return NULL; -} - -static int -ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ - int status = -EINVAL; - struct sk_buff *nskb; - - if (copy_mode == IPQ_COPY_NONE) - return -EAGAIN; - - nskb = ipq_build_packet_message(entry, &status); - if (nskb == NULL) - return status; - - spin_lock_bh(&queue_lock); - - if (!peer_pid) - goto err_out_free_nskb; - - if (queue_total >= queue_maxlen) { - queue_dropped++; - status = -ENOSPC; - if (net_ratelimit()) - printk (KERN_WARNING "ip_queue: full at %d entries, " - "dropping packets(s). Dropped: %d\n", queue_total, - queue_dropped); - goto err_out_free_nskb; - } - - /* netlink_unicast will either free the nskb or attach it to a socket */ - status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); - if (status < 0) { - queue_user_dropped++; - goto err_out_unlock; - } - - __ipq_enqueue_entry(entry); - - spin_unlock_bh(&queue_lock); - return status; - -err_out_free_nskb: - kfree_skb(nskb); - -err_out_unlock: - spin_unlock_bh(&queue_lock); - return status; -} - -static int -ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) -{ - int diff; - struct iphdr *user_iph = (struct iphdr *)v->payload; - struct sk_buff *nskb; - - if (v->data_len < sizeof(*user_iph)) - return 0; - diff = v->data_len - e->skb->len; - if (diff < 0) { - if (pskb_trim(e->skb, v->data_len)) - return -ENOMEM; - } else if (diff > 0) { - if (v->data_len > 0xFFFF) - return -EINVAL; - if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), - diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "ip_queue: error " - "in mangle, dropping packet\n"); - return -ENOMEM; - } - kfree_skb(e->skb); - e->skb = nskb; - } - skb_put(e->skb, diff); - } - if (!skb_make_writable(e->skb, v->data_len)) - return -ENOMEM; - skb_copy_to_linear_data(e->skb, v->payload, v->data_len); - e->skb->ip_summed = CHECKSUM_NONE; - - return 0; -} - -static int -ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) -{ - struct nf_queue_entry *entry; - - if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) - return -EINVAL; - - entry = ipq_find_dequeue_entry(vmsg->id); - if (entry == NULL) - return -ENOENT; - else { - int verdict = vmsg->value; - - if (vmsg->data_len && vmsg->data_len == len) - if (ipq_mangle_ipv4(vmsg, entry) < 0) - verdict = NF_DROP; - - nf_reinject(entry, verdict); - return 0; - } -} - -static int -ipq_set_mode(unsigned char mode, unsigned int range) -{ - int status; - - spin_lock_bh(&queue_lock); - status = __ipq_set_mode(mode, range); - spin_unlock_bh(&queue_lock); - return status; -} - -static int -ipq_receive_peer(struct ipq_peer_msg *pmsg, - unsigned char type, unsigned int len) -{ - int status = 0; - - if (len < sizeof(*pmsg)) - return -EINVAL; - - switch (type) { - case IPQM_MODE: - status = ipq_set_mode(pmsg->msg.mode.value, - pmsg->msg.mode.range); - break; - - case IPQM_VERDICT: - status = ipq_set_verdict(&pmsg->msg.verdict, - len - sizeof(*pmsg)); - break; - default: - status = -EINVAL; - } - return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ - if (entry->indev) - if (entry->indev->ifindex == ifindex) - return 1; - if (entry->outdev) - if (entry->outdev->ifindex == ifindex) - return 1; -#ifdef CONFIG_BRIDGE_NETFILTER - if (entry->skb->nf_bridge) { - if (entry->skb->nf_bridge->physindev && - entry->skb->nf_bridge->physindev->ifindex == ifindex) - return 1; - if (entry->skb->nf_bridge->physoutdev && - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) - return 1; - } -#endif - return 0; -} - -static void -ipq_dev_drop(int ifindex) -{ - ipq_flush(dev_cmp, ifindex); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static inline void -__ipq_rcv_skb(struct sk_buff *skb) -{ - int status, type, pid, flags; - unsigned int nlmsglen, skblen; - struct nlmsghdr *nlh; - bool enable_timestamp = false; - - skblen = skb->len; - if (skblen < sizeof(*nlh)) - return; - - nlh = nlmsg_hdr(skb); - nlmsglen = nlh->nlmsg_len; - if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) - return; - - pid = nlh->nlmsg_pid; - flags = nlh->nlmsg_flags; - - if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) - RCV_SKB_FAIL(-EINVAL); - - if (flags & MSG_TRUNC) - RCV_SKB_FAIL(-ECOMM); - - type = nlh->nlmsg_type; - if (type < NLMSG_NOOP || type >= IPQM_MAX) - RCV_SKB_FAIL(-EINVAL); - - if (type <= IPQM_BASE) - return; - - if (!capable(CAP_NET_ADMIN)) - RCV_SKB_FAIL(-EPERM); - - spin_lock_bh(&queue_lock); - - if (peer_pid) { - if (peer_pid != pid) { - spin_unlock_bh(&queue_lock); - RCV_SKB_FAIL(-EBUSY); - } - } else { - enable_timestamp = true; - peer_pid = pid; - } - - spin_unlock_bh(&queue_lock); - if (enable_timestamp) - net_enable_timestamp(); - status = ipq_receive_peer(NLMSG_DATA(nlh), type, - nlmsglen - NLMSG_LENGTH(0)); - if (status < 0) - RCV_SKB_FAIL(status); - - if (flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); -} - -static void -ipq_rcv_skb(struct sk_buff *skb) -{ - mutex_lock(&ipqnl_mutex); - __ipq_rcv_skb(skb); - mutex_unlock(&ipqnl_mutex); -} - -static int -ipq_rcv_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = ptr; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - /* Drop any packets associated with the downed device */ - if (event == NETDEV_DOWN) - ipq_dev_drop(dev->ifindex); - return NOTIFY_DONE; -} - -static struct notifier_block ipq_dev_notifier = { - .notifier_call = ipq_rcv_dev_event, -}; - -static int -ipq_rcv_nl_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct netlink_notify *n = ptr; - - if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { - spin_lock_bh(&queue_lock); - if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) - __ipq_reset(); - spin_unlock_bh(&queue_lock); - } - return NOTIFY_DONE; -} - -static struct notifier_block ipq_nl_notifier = { - .notifier_call = ipq_rcv_nl_event, -}; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *ipq_sysctl_header; - -static ctl_table ipq_table[] = { - { - .procname = NET_IPQ_QMAX_NAME, - .data = &queue_maxlen, - .maxlen = sizeof(queue_maxlen), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { } -}; -#endif - -#ifdef CONFIG_PROC_FS -static int ip_queue_show(struct seq_file *m, void *v) -{ - spin_lock_bh(&queue_lock); - - seq_printf(m, - "Peer PID : %d\n" - "Copy mode : %hu\n" - "Copy range : %u\n" - "Queue length : %u\n" - "Queue max. length : %u\n" - "Queue dropped : %u\n" - "Netlink dropped : %u\n", - peer_pid, - copy_mode, - copy_range, - queue_total, - queue_maxlen, - queue_dropped, - queue_user_dropped); - - spin_unlock_bh(&queue_lock); - return 0; -} - -static int ip_queue_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip_queue_show, NULL); -} - -static const struct file_operations ip_queue_proc_fops = { - .open = ip_queue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; -#endif - -static const struct nf_queue_handler nfqh = { - .name = "ip_queue", - .outfn = &ipq_enqueue_packet, -}; - -static int __init ip_queue_init(void) -{ - int status = -ENOMEM; - struct proc_dir_entry *proc __maybe_unused; - - netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, - ipq_rcv_skb, NULL, THIS_MODULE); - if (ipqnl == NULL) { - printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } - -#ifdef CONFIG_PROC_FS - proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, - &ip_queue_proc_fops); - if (!proc) { - printk(KERN_ERR "ip_queue: failed to create proc entry\n"); - goto cleanup_ipqnl; - } -#endif - register_netdevice_notifier(&ipq_dev_notifier); -#ifdef CONFIG_SYSCTL - ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table); -#endif - status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh); - if (status < 0) { - printk(KERN_ERR "ip_queue: failed to register queue handler\n"); - goto cleanup_sysctl; - } - return status; - -cleanup_sysctl: -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ipq_sysctl_header); -#endif - unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); -cleanup_ipqnl: __maybe_unused - netlink_kernel_release(ipqnl); - mutex_lock(&ipqnl_mutex); - mutex_unlock(&ipqnl_mutex); - -cleanup_netlink_notifier: - netlink_unregister_notifier(&ipq_nl_notifier); - return status; -} - -static void __exit ip_queue_fini(void) -{ - nf_unregister_queue_handlers(&nfqh); - - ipq_flush(NULL, 0); - -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ipq_sysctl_header); -#endif - unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - - netlink_kernel_release(ipqnl); - mutex_lock(&ipqnl_mutex); - mutex_unlock(&ipqnl_mutex); - - netlink_unregister_notifier(&ipq_nl_notifier); -} - -MODULE_DESCRIPTION("IPv4 packet queue handler"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL); - -module_init(ip_queue_init); -module_exit(ip_queue_fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 24e556e83a3b..170b1fdd6b72 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -153,8 +153,7 @@ ip_checkentry(const struct ipt_ip *ip) static unsigned int ipt_error(struct sk_buff *skb, const struct xt_action_param *par) { - if (net_ratelimit()) - pr_info("error: `%s'\n", (const char *)par->targinfo); + net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } @@ -377,7 +376,7 @@ ipt_do_table(struct sk_buff *skb, if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { - verdict = (unsigned)(-v) - 1; + verdict = (unsigned int)(-v) - 1; break; } if (*stackptr <= origptr) { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index a639967eb727..fe5daea5214d 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -246,8 +246,7 @@ clusterip_hashfn(const struct sk_buff *skb, dport = ports[1]; } } else { - if (net_ratelimit()) - pr_info("unknown protocol %u\n", iph->protocol); + net_info_ratelimited("unknown protocol %u\n", iph->protocol); } switch (config->hash_mode) { diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index ba5756d20165..1109f7f6c254 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum, pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); - /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ - nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, - sizeof(*pm)+copy_len); + nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, + sizeof(*pm)+copy_len, 0); + if (!nlh) { + pr_debug("error during nlmsg_put\n"); + goto out_unlock; + } ub->qlen++; - pm = NLMSG_DATA(nlh); + pm = nlmsg_data(nlh); /* We might not have a timestamp, get one */ if (skb->tstamp.tv64 == 0) @@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum, nlh->nlmsg_type = NLMSG_DONE; ulog_send(groupnum); } - +out_unlock: spin_unlock_bh(&ulog_lock); return; -nlmsg_failure: - pr_debug("error during NLMSG_PUT\n"); alloc_failure: pr_debug("Error building netlink message\n"); spin_unlock_bh(&ulog_lock); @@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { static int __init ulog_tg_init(void) { int ret, i; + struct netlink_kernel_cfg cfg = { + .groups = ULOG_MAXNLGROUPS, + }; pr_debug("init module\n"); @@ -392,9 +396,8 @@ static int __init ulog_tg_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(&init_net, - NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - NULL, THIS_MODULE); + nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, + THIS_MODULE, &cfg); if (!nflognl) return -ENOMEM; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index de9da21113a1..e7ff2dcab6ce 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -74,24 +74,32 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (iph == NULL) - return -NF_DROP; + return -NF_ACCEPT; /* Conntrack defragments packets, we might still see fragments * inside ICMP packets though. */ if (iph->frag_off & htons(IP_OFFSET)) - return -NF_DROP; + return -NF_ACCEPT; *dataoff = nhoff + (iph->ihl << 2); *protonum = iph->protocol; + /* Check bogus IP headers */ + if (*dataoff > skb->len) { + pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " + "nhoff %u, ihl %u, skblen %u\n", + nhoff, iph->ihl << 2, skb->len); + return -NF_ACCEPT; + } + return NF_ACCEPT; } -static unsigned int ipv4_confirm(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ipv4_helper(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -102,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; + return NF_ACCEPT; help = nfct_help(ct); if (!help) - goto out; + return NF_ACCEPT; /* rcu_read_lock()ed by nf_hook_slow */ helper = rcu_dereference(help->helper); if (!helper) - goto out; + return NF_ACCEPT; ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), ct, ctinfo); - if (ret != NF_ACCEPT) { + if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, "nf_ct_%s: dropping packet", helper->name); - return ret; } + return ret; +} + +static unsigned int ipv4_confirm(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && @@ -177,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .priority = NF_IP_PRI_CONNTRACK, }, { + .hook = ipv4_helper, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, @@ -184,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { + .hook = ipv4_helper, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, @@ -199,35 +235,30 @@ static int log_invalid_proto_max = 255; static ctl_table ip_ct_sysctl_table[] = { { .procname = "ip_conntrack_max", - .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_count", - .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_buckets", - .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_checksum", - .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ip_conntrack_log_invalid", - .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -303,8 +334,9 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); - NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); + if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || + nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) + goto nla_put_failure; return 0; nla_put_failure: @@ -342,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = { .owner = THIS_MODULE, }; +static int ipv4_init_net(struct net *net) +{ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + struct nf_ip_net *in = &net->ct.nf_ct_proto; + in->ctl_table = kmemdup(ip_ct_sysctl_table, + sizeof(ip_ct_sysctl_table), + GFP_KERNEL); + if (!in->ctl_table) + return -ENOMEM; + + in->ctl_table[0].data = &nf_conntrack_max; + in->ctl_table[1].data = &net->ct.count; + in->ctl_table[2].data = &net->ct.htable_size; + in->ctl_table[3].data = &net->ct.sysctl_checksum; + in->ctl_table[4].data = &net->ct.sysctl_log_invalid; +#endif + return 0; +} + struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .l3proto = PF_INET, .name = "ipv4", @@ -356,9 +407,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .nla_policy = ipv4_nla_policy, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, - .ctl_table = ip_ct_sysctl_table, + .ctl_table_path = "net/ipv4/netfilter", #endif + .init_net = ipv4_init_net, .me = THIS_MODULE, }; @@ -369,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); +static int ipv4_net_init(struct net *net) +{ + int ret = 0; + + ret = nf_conntrack_l4proto_register(net, + &nf_conntrack_l4proto_tcp4); + if (ret < 0) { + pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n"); + goto out_tcp; + } + ret = nf_conntrack_l4proto_register(net, + &nf_conntrack_l4proto_udp4); + if (ret < 0) { + pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n"); + goto out_udp; + } + ret = nf_conntrack_l4proto_register(net, + &nf_conntrack_l4proto_icmp); + if (ret < 0) { + pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n"); + goto out_icmp; + } + ret = nf_conntrack_l3proto_register(net, + &nf_conntrack_l3proto_ipv4); + if (ret < 0) { + pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n"); + goto out_ipv4; + } + return 0; +out_ipv4: + nf_conntrack_l4proto_unregister(net, + &nf_conntrack_l4proto_icmp); +out_icmp: + nf_conntrack_l4proto_unregister(net, + &nf_conntrack_l4proto_udp4); +out_udp: + nf_conntrack_l4proto_unregister(net, + &nf_conntrack_l4proto_tcp4); +out_tcp: + return ret; +} + +static void ipv4_net_exit(struct net *net) +{ + nf_conntrack_l3proto_unregister(net, + &nf_conntrack_l3proto_ipv4); + nf_conntrack_l4proto_unregister(net, + &nf_conntrack_l4proto_icmp); + nf_conntrack_l4proto_unregister(net, + &nf_conntrack_l4proto_udp4); + nf_conntrack_l4proto_unregister(net, + &nf_conntrack_l4proto_tcp4); +} + +static struct pernet_operations ipv4_net_ops = { + .init = ipv4_net_init, + .exit = ipv4_net_exit, +}; + static int __init nf_conntrack_l3proto_ipv4_init(void) { int ret = 0; @@ -382,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) return ret; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); + ret = register_pernet_subsys(&ipv4_net_ops); if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register tcp.\n"); + pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); goto cleanup_sockopt; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register udp.\n"); - goto cleanup_tcp; - } - - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register icmp.\n"); - goto cleanup_udp; - } - - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4\n"); - goto cleanup_icmp; - } - ret = nf_register_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); if (ret < 0) { pr_err("nf_conntrack_ipv4: can't register hooks.\n"); - goto cleanup_ipv4; + goto cleanup_pernet; } #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) ret = nf_conntrack_ipv4_compat_init(); @@ -422,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) cleanup_hooks: nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); #endif - cleanup_ipv4: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - cleanup_icmp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + cleanup_pernet: + unregister_pernet_subsys(&ipv4_net_ops); cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst); return ret; @@ -442,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) nf_conntrack_ipv4_compat_fini(); #endif nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + unregister_pernet_subsys(&ipv4_net_ops); nf_unregister_sockopt(&so_getorigdst); } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7cbe9cb261c2..5241d997ab75 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -23,6 +23,11 @@ static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; +static inline struct nf_icmp_net *icmp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp; +} + static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { @@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s, static unsigned int *icmp_get_timeouts(struct net *net) { - return &nf_ct_icmp_timeout; + return &icmp_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ @@ -228,10 +233,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl, static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); - NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); - NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); - + if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; return 0; nla_put_failure: @@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> -static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) { unsigned int *timeout = data; + struct nf_icmp_net *in = icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; } else { /* Set default ICMP timeout. */ - *timeout = nf_ct_icmp_timeout; + *timeout = in->timeout; } return 0; } @@ -293,8 +300,8 @@ icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; - NLA_PUT_BE32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)); - + if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; return 0; nla_put_failure: @@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ #ifdef CONFIG_SYSCTL -static struct ctl_table_header *icmp_sysctl_header; static struct ctl_table icmp_sysctl_table[] = { { .procname = "nf_conntrack_icmp_timeout", - .data = &nf_ct_icmp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = { static struct ctl_table icmp_compat_sysctl_table[] = { { .procname = "ip_conntrack_icmp_timeout", - .data = &nf_ct_icmp_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = { #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ +static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmp_sysctl_table, + sizeof(icmp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, + sizeof(icmp_compat_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_compat_table) + return -ENOMEM; + + pn->ctl_compat_table[0].data = &in->timeout; +#endif +#endif + return 0; +} + +static int icmp_init_net(struct net *net, u_int16_t proto) +{ + int ret; + struct nf_icmp_net *in = icmp_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmp_timeout; + + ret = icmp_kmemdup_compat_sysctl_table(pn, in); + if (ret < 0) + return ret; + + ret = icmp_kmemdup_sysctl_table(pn, in); + if (ret < 0) + nf_ct_kfree_compat_sysctl_table(pn); + + return ret; +} + +static struct nf_proto_net *icmp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = { .l3proto = PF_INET, @@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ -#ifdef CONFIG_SYSCTL - .ctl_table_header = &icmp_sysctl_header, - .ctl_table = icmp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - .ctl_compat_table = icmp_compat_sysctl_table, -#endif -#endif + .init_net = icmp_init_net, + .get_net_proto = icmp_get_net_proto, }; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9bb1b8a37a22..742815518b0f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 7b22382ff0e9..3c04d24e2976 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -13,10 +13,10 @@ #include <linux/skbuff.h> #include <linux/udp.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_nat_rule.h> #include <linux/netfilter/nf_conntrack_amanda.h> MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index abb52adf5acd..44b082fd48ab 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; +static struct nfq_ct_nat_hook nfq_ct_nat = { + .seq_adjust = nf_nat_tcp_seq_adjust, +}; + static int __init nf_nat_init(void) { size_t i; @@ -731,6 +735,7 @@ static int __init nf_nat_init(void) nfnetlink_parse_nat_setup); BUG_ON(nf_ct_nat_offset != NULL); RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); + RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat); return 0; cleanup_extend: @@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void) RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); RCU_INIT_POINTER(nf_ct_nat_offset, NULL); + RCU_INIT_POINTER(nfq_ct_nat_hook, NULL); synchronize_net(); } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 82536701e3a3..c6784a18c1c4 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -42,9 +42,7 @@ static int set_addr(struct sk_buff *skb, if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { - if (net_ratelimit()) - pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet" - " error\n"); + net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); return -1; } @@ -58,9 +56,7 @@ static int set_addr(struct sk_buff *skb, if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { - if (net_ratelimit()) - pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet" - " error\n"); + net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; } /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy @@ -99,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, TransportAddress *taddr, int count) { - const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + const struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int i; __be16 port; @@ -182,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, struct nf_conntrack_expect *rtp_exp, struct nf_conntrack_expect *rtcp_exp) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); int i; u_int16_t nated_port; @@ -214,8 +210,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, /* Run out of expectations */ if (i >= H323_RTP_CHANNEL_MAX) { - if (net_ratelimit()) - pr_notice("nf_nat_h323: out of expectations\n"); + net_notice_ratelimited("nf_nat_h323: out of expectations\n"); return 0; } @@ -244,8 +239,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, } if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - pr_notice("nf_nat_h323: out of RTP ports\n"); + net_notice_ratelimited("nf_nat_h323: out of RTP ports\n"); return 0; } @@ -308,8 +302,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, } if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - pr_notice("nf_nat_h323: out of TCP ports\n"); + net_notice_ratelimited("nf_nat_h323: out of TCP ports\n"); return 0; } @@ -337,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, TransportAddress *taddr, __be16 port, struct nf_conntrack_expect *exp) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); u_int16_t nated_port = ntohs(port); @@ -365,8 +358,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, } if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - pr_notice("nf_nat_q931: out of TCP ports\n"); + net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); return 0; } @@ -427,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, TransportAddress *taddr, int idx, __be16 port, struct nf_conntrack_expect *exp) { - struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + struct nf_ct_h323_master *info = nfct_help_data(ct); int dir = CTINFO2DIR(ctinfo); u_int16_t nated_port = ntohs(port); union nf_inet_addr addr; @@ -456,8 +448,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, } if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - pr_notice("nf_nat_ras: out of TCP ports\n"); + net_notice_ratelimited("nf_nat_ras: out of TCP ports\n"); return 0; } @@ -545,8 +536,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, } if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - pr_notice("nf_nat_q931: out of TCP ports\n"); + net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index af65958f6308..2e59ad0b90ca 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); +void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, + u32 ctinfo, int off) +{ + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb)); + nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust); + static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, int datalen, __sum16 *check, int oldlen) { diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index c273d58980ae..388140881ebe 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct, const struct nf_nat_pptp *nat_pptp_info; struct nf_nat_ipv4_range range; - ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; + ct_pptp_info = nfct_help_data(master); nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; /* And here goes the grand finale of corrosion... */ @@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb, __be16 new_callid; unsigned int cid_off; - ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; + ct_pptp_info = nfct_help_data(ct); nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; new_callid = ct_pptp_info->pns_call_id; @@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, struct nf_ct_pptp_master *ct_pptp_info; struct nf_nat_pptp *nat_pptp_info; - ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; + ct_pptp_info = nfct_help_data(ct); nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; /* save original PAC call ID in nat_info */ diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 57932c43960e..ea4a23813d26 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -283,7 +283,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, __be32 newip; u_int16_t port; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - unsigned buflen; + unsigned int buflen; /* Connection will come from reply */ if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 2133c30a4a5f..bac712293fd6 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, ptr = *octets; while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { + if (!asn1_octet_decode(ctx, ptr++)) { kfree(*octets); *octets = NULL; return 0; @@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, } break; case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + if (!asn1_oid_decode(ctx, end, &lp, &len)) { kfree(id); return 0; } @@ -1206,8 +1206,7 @@ static int snmp_translate(struct nf_conn *ct, if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), paylen, &map, &udph->check)) { - if (net_ratelimit()) - printk(KERN_WARNING "bsalg: parser failed\n"); + net_warn_ratelimited("bsalg: parser failed\n"); return NF_DROP; } return NF_ACCEPT; @@ -1241,9 +1240,8 @@ static int help(struct sk_buff *skb, unsigned int protoff, * can mess around with the payload. */ if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { - if (net_ratelimit()) - printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", - &iph->saddr, &iph->daddr); + net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", + &iph->saddr, &iph->daddr); return NF_DROP; } diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index a2901bf829c0..9dbb8d284f99 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c @@ -8,10 +8,10 @@ #include <linux/module.h> #include <linux/udp.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_nat_rule.h> #include <linux/netfilter/nf_conntrack_tftp.h> MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 50009c787bcd..6232d476f37e 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -51,15 +51,16 @@ static struct ping_table ping_table; static u16 ping_port_rover; -static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) +static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) { int res = (num + net_hash_mix(net)) & mask; + pr_debug("hash(%d) = %d\n", num, res); return res; } static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, - struct net *net, unsigned num) + struct net *net, unsigned int num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } @@ -188,7 +189,8 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low, gid_t *high) { gid_t *data = net->ipv4.sysctl_ping_group_range; - unsigned seq; + unsigned int seq; + do { seq = read_seqbegin(&sysctl_local_ports.lock); @@ -205,17 +207,22 @@ static int ping_init_sock(struct sock *sk) gid_t range[2]; struct group_info *group_info = get_current_groups(); int i, j, count = group_info->ngroups; + kgid_t low, high; inet_get_ping_group_range_net(net, range, range+1); + low = make_kgid(&init_user_ns, range[0]); + high = make_kgid(&init_user_ns, range[1]); + if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low)) + return -EACCES; + if (range[0] <= group && group <= range[1]) return 0; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); - for (j = 0; j < cp_count; j++) { - group = group_info->blocks[i][j]; - if (range[0] <= group && group <= range[1]) + kgid_t gid = group_info->blocks[i][j]; + if (gid_lte(low, gid) && gid_lte(gid, high)) return 0; } @@ -364,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info) break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + ipv4_sk_update_pmtu(skb, sk, info); if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; @@ -379,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info) break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ + ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } @@ -410,7 +419,7 @@ struct pingfakehdr { __wsum wcheck; }; -static int ping_getfrag(void *from, char * to, +static int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = (struct pingfakehdr *)from; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8af0d44e4e22..957acd12250b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), - SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), @@ -258,6 +257,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), + SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), + SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), + SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), + SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), + SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), + SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9ae5c01cd0b2..8918eff1426d 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash = protocol & (MAX_INET_PROTOS - 1); - - return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol); int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); + int ret; - ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bbd604c68e68..ff0f071969ea 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -216,6 +216,11 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) int err = 0; int harderr = 0; + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_sk_update_pmtu(skb, sk, info); + else if (type == ICMP_REDIRECT) + ipv4_sk_redirect(skb, sk); + /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication @@ -288,7 +293,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_unlock(&raw_v4_hashinfo.lock); } -static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { /* Charge it to the socket. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4dc1c104c942..d547f6fae20d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -109,6 +109,7 @@ #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#include <linux/kmemleak.h> #endif #include <net/secure_seq.h> @@ -147,7 +148,10 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -157,40 +161,13 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer; - u32 *p = NULL; - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - - peer = rt->peer; - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; - - p = peer->metrics; - if (inet_metrics_new(peer)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); - - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; - } else { - if (rt->fi) { - fib_info_put(rt->fi); - rt->fi = NULL; - } - } - } - return p; + WARN_ON(1); + return NULL; } -static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, @@ -205,6 +182,7 @@ static struct dst_ops ipv4_dst_ops = { .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, + .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, }; @@ -229,7 +207,7 @@ const __u8 ip_tos2prio[16] = { TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; - +EXPORT_SYMBOL(ip_tos2prio); /* * Route cache. @@ -296,7 +274,7 @@ static inline void rt_hash_lock_init(void) #endif static struct rt_hash_bucket *rt_hash_table __read_mostly; -static unsigned rt_hash_mask __read_mostly; +static unsigned int rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); @@ -420,29 +398,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "HHUptod\tSpecDst"); else { struct rtable *r = v; - struct neighbour *n; - int len, HHUptod; - - rcu_read_lock(); - n = dst_get_neighbour_noref(&r->dst); - HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; - rcu_read_unlock(); + int len; seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" - "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->dst.dev ? r->dst.dev->name : "*", - (__force u32)r->rt_dst, - (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->dst.__refcnt), - r->dst.__use, 0, (__force u32)r->rt_src, - dst_metric_advmss(&r->dst) + 40, - dst_metric(&r->dst, RTAX_WINDOW), - (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + - dst_metric(&r->dst, RTAX_RTTVAR)), - r->rt_key_tos, - -1, - HHUptod, - r->rt_spec_dst, &len); + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + r->dst.dev ? r->dst.dev->name : "*", + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, + r->rt_flags, atomic_read(&r->dst.__refcnt), + r->dst.__use, 0, (__force u32)r->rt_src, + dst_metric_advmss(&r->dst) + 40, + dst_metric(&r->dst, RTAX_WINDOW), 0, + r->rt_key_tos, + -1, 0, 0, &len); seq_printf(seq, "%*s\n", 127 - len, ""); } @@ -679,7 +647,7 @@ static inline int rt_fast_clean(struct rtable *rth) static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - (rth->peer && rth->peer->pmtu_expires); + rth->dst.expires; } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -872,34 +840,22 @@ static void rt_check_expire(void) while ((rth = rcu_dereference_protected(*rthp, lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { + if (rt_is_expired(rth) || + rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { *rthp = rth->dst.rt_next; rt_free(rth); continue; } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - /* Cleanup aged off entries. */ - *rthp = rth->dst.rt_next; - rt_free(rth); + /* We only count entries on a chain with equal + * hash inputs once so that entries for + * different QOS levels, and other non-hash + * input attributes don't unfairly skew the + * length computation + */ + tmo >>= 1; + rthp = &rth->dst.rt_next; + length += has_noalias(rt_hash_table[i].chain, rth); } spin_unlock_bh(rt_hash_lock_addr(i)); sum += length; @@ -937,7 +893,6 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); - inetpeer_invalidate_tree(AF_INET); } /* @@ -959,8 +914,7 @@ void rt_cache_flush_batch(struct net *net) static void rt_emergency_hash_rebuild(struct net *net) { - if (net_ratelimit()) - pr_warn("Route hash chain too long!\n"); + net_warn_ratelimited("Route hash chain too long!\n"); rt_cache_invalidate(net); } @@ -1083,8 +1037,7 @@ static int rt_garbage_collect(struct dst_ops *ops) goto out; if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) goto out; - if (net_ratelimit()) - pr_warn("dst cache overflow\n"); + net_warn_ratelimited("dst cache overflow\n"); RT_CACHE_STAT_INC(gc_dst_overflow); return 1; @@ -1112,20 +1065,20 @@ static int slow_chain_length(const struct rtable *head) return length >> FRACT_BITS; } -static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) { - static const __be32 inaddr_any = 0; struct net_device *dev = dst->dev; const __be32 *pkey = daddr; const struct rtable *rt; struct neighbour *n; rt = (const struct rtable *) dst; - - if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) - pkey = &inaddr_any; - else if (rt->rt_gateway) + if (rt->rt_gateway) pkey = (const __be32 *) &rt->rt_gateway; + else if (skb) + pkey = &ip_hdr(skb)->daddr; n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); if (n) @@ -1133,17 +1086,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo return neigh_create(&arp_tbl, pkey, dev); } -static int rt_bind_neighbour(struct rtable *rt) -{ - struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); - if (IS_ERR(n)) - return PTR_ERR(n); - dst_set_neighbour(&rt->dst, n); - - return 0; -} - -static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, +static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, struct sk_buff *skb, int ifindex) { struct rtable *rth, *cand; @@ -1151,7 +1094,6 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, unsigned long now; u32 min_score; int chain_length; - int attempts = !in_softirq(); restart: chain_length = 0; @@ -1160,7 +1102,7 @@ restart: candp = NULL; now = jiffies; - if (!rt_caching(dev_net(rt->dst.dev))) { + if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) { /* * If we're not caching, just tell the caller we * were successful and don't touch the route. The @@ -1178,16 +1120,6 @@ restart: */ rt->dst.flags |= DST_NOCACHE; - if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { - int err = rt_bind_neighbour(rt); - if (err) { - if (net_ratelimit()) - pr_warn("Neighbour table failure & not caching routes\n"); - ip_rt_put(rt); - return ERR_PTR(err); - } - } - goto skip_hashing; } @@ -1270,41 +1202,6 @@ restart: } } - /* Try to bind route to arp only if it is output - route or unicast forwarding path. - */ - if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { - int err = rt_bind_neighbour(rt); - if (err) { - spin_unlock_bh(rt_hash_lock_addr(hash)); - - if (err != -ENOBUFS) { - rt_drop(rt); - return ERR_PTR(err); - } - - /* Neighbour tables are full and nothing - can be released. Try to shrink route cache, - it is most likely it holds some neighbour records. - */ - if (attempts-- > 0) { - int saved_elasticity = ip_rt_gc_elasticity; - int saved_int = ip_rt_gc_min_interval; - ip_rt_gc_elasticity = 1; - ip_rt_gc_min_interval = 0; - rt_garbage_collect(&ipv4_dst_ops); - ip_rt_gc_min_interval = saved_int; - ip_rt_gc_elasticity = saved_elasticity; - goto restart; - } - - if (net_ratelimit()) - pr_warn("Neighbour table overflow\n"); - rt_drop(rt); - return ERR_PTR(-ENOBUFS); - } - } - rt->dst.rt_next = rt_hash_table[hash].chain; /* @@ -1322,25 +1219,6 @@ skip_hashing: return rt; } -static atomic_t __rt_peer_genid = ATOMIC_INIT(0); - -static u32 rt_peer_genid(void) -{ - return atomic_read(&__rt_peer_genid); -} - -void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) -{ - struct inet_peer *peer; - - peer = inet_getpeer_v4(daddr, create); - - if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) - inet_putpeer(peer); - else - rt->rt_peer_genid = rt_peer_genid(); -} - /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. @@ -1363,28 +1241,21 @@ static void ip_select_fb_ident(struct iphdr *iph) void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) { - struct rtable *rt = (struct rtable *) dst; - - if (rt && !(rt->dst.flags & DST_NOPEER)) { - if (rt->peer == NULL) - rt_bind_peer(rt, rt->rt_dst, 1); + struct net *net = dev_net(dst->dev); + struct inet_peer *peer; - /* If peer is attached to destination, it is never detached, - so that we need not to grab a lock to dereference it. - */ - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); - return; - } - } else if (!rt) - printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", - __builtin_return_address(0)); + peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); + if (peer) { + iph->id = htons(inet_getid(peer, more)); + inet_putpeer(peer); + return; + } ip_select_fb_ident(iph); } EXPORT_SYMBOL(__ip_select_ident); -static void rt_del(unsigned hash, struct rtable *rt) +static void rt_del(unsigned int hash, struct rtable *rt) { struct rtable __rcu **rthp; struct rtable *aux; @@ -1404,43 +1275,173 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) { - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - struct neighbour *n, *old_n; + if (sk) { + const struct inet_sock *inet = inet_sk(sk); + + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; + } + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); +} - dst_confirm(&rt->dst); +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, + const struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; - rt->rt_gateway = peer->redirect_learned.a4; + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); +} - n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); - if (IS_ERR(n)) { - rt->rt_gateway = orig_gw; - return; +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); +} + +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct sk_buff *skb) +{ + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); +} + +static DEFINE_SEQLOCK(fnhe_seqlock); + +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +{ + struct fib_nh_exception *fnhe, *oldest; + + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; + } + return oldest; +} + +static inline u32 fnhe_hashfun(__be32 daddr) +{ + u32 hval; + + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); + + return hval & (FNHE_HASH_SIZE - 1); +} + +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + u32 pmtu, unsigned long expires) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe; + int depth; + u32 hval = fnhe_hashfun(daddr); + + write_seqlock_bh(&fnhe_seqlock); + + hash = nh->nh_exceptions; + if (!hash) { + hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); + if (!hash) + goto out_unlock; + nh->nh_exceptions = hash; + } + + hash += hval; + + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + break; + depth++; } - old_n = xchg(&rt->dst._neighbour, n); - if (old_n) - neigh_release(old_n); - if (!(n->nud_state & NUD_VALID)) { - neigh_event_send(n, NULL); + + if (fnhe) { + if (gw) + fnhe->fnhe_gw = gw; + if (pmtu) { + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = expires; + } } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + if (depth > FNHE_RECLAIM_DEPTH) + fnhe = fnhe_oldest(hash); + else { + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); + } + fnhe->fnhe_daddr = daddr; + fnhe->fnhe_gw = gw; + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = expires; } + + fnhe->fnhe_stamp = jiffies; + +out_unlock: + write_sequnlock_bh(&fnhe_seqlock); + return; } -/* called in rcu_read_lock() section */ -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, - __be32 saddr, struct net_device *dev) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4) { - int s, i; - struct in_device *in_dev = __in_dev_get_rcu(dev); - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct inet_peer *peer; + __be32 new_gw = icmp_hdr(skb)->un.gateway; + __be32 old_gw = ip_hdr(skb)->saddr; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct neighbour *n; struct net *net; + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + break; + + default: + return; + } + + if (rt->rt_gateway != old_gw) + return; + + in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; @@ -1460,72 +1461,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (s = 0; s < 2; s++) { - for (i = 0; i < 2; i++) { - unsigned int hash; - struct rtable __rcu **rthp; - struct rtable *rt; - - hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rt = rcu_dereference(*rthp)) != NULL) { - rthp = &rt->dst.rt_next; - - if (rt->rt_key_dst != daddr || - rt->rt_key_src != skeys[s] || - rt->rt_oif != ikeys[i] || - rt_is_input_route(rt) || - rt_is_expired(rt) || - !net_eq(dev_net(rt->dst.dev), net) || - rt->dst.error || - rt->dst.dev != dev || - rt->rt_gateway != old_gw) - continue; - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); - peer = rt->peer; - if (peer) { - if (peer->redirect_learned.a4 != new_gw) { - peer->redirect_learned.a4 = new_gw; - atomic_inc(&__rt_peer_genid); - } - check_peer_redir(&rt->dst, peer); - } + update_or_create_fnhe(nh, fl4->daddr, new_gw, + 0, 0); } + rt->rt_gateway = new_gw; + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } + neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - pr_info("Redirect from %pI4 on %s about %pI4 ignored\n" - " Advised path = %pI4 -> %pI4\n", - &old_gw, dev->name, &new_gw, - &saddr, &daddr); + if (IN_DEV_LOG_MARTIANS(in_dev)) { + const struct iphdr *iph = (const struct iphdr *) skb->data; + __be32 daddr = iph->daddr; + __be32 saddr = iph->saddr; + + net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); + } #endif ; } -static bool peer_pmtu_expired(struct inet_peer *peer) +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); - - return orig && - time_after_eq(jiffies, orig) && - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; -} + struct rtable *rt; + struct flowi4 fl4; -static bool peer_pmtu_cleaned(struct inet_peer *peer) -{ - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + rt = (struct rtable *) dst; - return orig && - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_do_redirect(rt, skb, &fl4); } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -1537,14 +1516,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if (rt->rt_flags & RTCF_REDIRECTED) { - unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, + } else if ((rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) { + unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, rt->rt_oif, rt_genid(dev_net(dst->dev))); rt_del(hash, rt); ret = NULL; - } else if (rt->peer && peer_pmtu_expired(rt->peer)) { - dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); } } return ret; @@ -1571,6 +1549,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; + struct net *net; int log_martians; rcu_read_lock(); @@ -1582,9 +1561,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + net = dev_net(rt->dst.dev); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); return; @@ -1601,7 +1579,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->rate_tokens >= ip_rt_redirect_number) { peer->rate_last = jiffies; - return; + goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent @@ -1616,23 +1594,40 @@ void ip_rt_send_redirect(struct sk_buff *skb) ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - peer->rate_tokens == ip_rt_redirect_number && - net_ratelimit()) - pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", - &ip_hdr(skb)->saddr, rt->rt_iif, - &rt->rt_dst, &rt->rt_gateway); + peer->rate_tokens == ip_rt_redirect_number) + net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", + &ip_hdr(skb)->saddr, rt->rt_iif, + &rt->rt_dst, &rt->rt_gateway); #endif } +out_put_peer: + inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); struct rtable *rt = skb_rtable(skb); struct inet_peer *peer; unsigned long now; + struct net *net; bool send; int code; + net = dev_net(rt->dst.dev); + if (!IN_DEV_FORWARD(in_dev)) { + switch (rt->dst.error) { + case EHOSTUNREACH: + IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); + break; + + case ENETUNREACH: + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + break; + } + goto out; + } + switch (rt->dst.error) { case EINVAL: default: @@ -1642,17 +1637,14 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->dst.dev), - IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); send = true; if (peer) { @@ -1665,6 +1657,7 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; + inet_putpeer(peer); } if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1673,136 +1666,96 @@ out: kfree_skb(skb); return 0; } -/* - * The last two values are not from the RFC but - * are needed for AMPRnet AX.25 paths. - */ +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ + struct fib_result res; -static const unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; -static inline unsigned short guess_mtu(unsigned short old_mtu) -{ - int i; + if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); - for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) - if (old_mtu > mtu_plateau[i]) - return mtu_plateau[i]; - return 68; + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + jiffies + ip_rt_mtu_expires); + } + rt->rt_pmtu = mtu; + dst_set_expires(&rt->dst, ip_rt_mtu_expires); } -unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, - unsigned short new_mtu, - struct net_device *dev) +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { - unsigned short old_mtu = ntohs(iph->tot_len); - unsigned short est_mtu = 0; - struct inet_peer *peer; - - peer = inet_getpeer_v4(iph->daddr, 1); - if (peer) { - unsigned short mtu = new_mtu; - - if (new_mtu < 68 || new_mtu >= old_mtu) { - /* BSD 4.2 derived systems incorrectly adjust - * tot_len by the IP header length, and report - * a zero MTU in the ICMP message. - */ - if (mtu == 0 && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; - mtu = guess_mtu(old_mtu); - } - - if (mtu < ip_rt_min_pmtu) - mtu = ip_rt_min_pmtu; - if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { - unsigned long pmtu_expires; + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; - pmtu_expires = jiffies + ip_rt_mtu_expires; - if (!pmtu_expires) - pmtu_expires = 1UL; + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); +} - est_mtu = mtu; - peer->pmtu_learned = mtu; - peer->pmtu_expires = pmtu_expires; - atomic_inc(&__rt_peer_genid); - } +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, + int oif, u32 mark, u8 protocol, int flow_flags) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - inet_putpeer(peer); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } - return est_mtu ? : new_mtu; } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (!expires) - return; - if (time_before(jiffies, expires)) { - u32 orig_dst_mtu = dst_mtu(dst); - if (peer->pmtu_learned < orig_dst_mtu) { - if (!peer->pmtu_orig) - peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); - dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); - } - } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) - dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); + } } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer; - - dst_confirm(dst); - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; - if (peer) { - unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); - - if (mtu < ip_rt_min_pmtu) - mtu = ip_rt_min_pmtu; - if (!pmtu_expires || mtu < peer->pmtu_learned) { - - pmtu_expires = jiffies + ip_rt_mtu_expires; - if (!pmtu_expires) - pmtu_expires = 1UL; - - peer->pmtu_learned = mtu; - peer->pmtu_expires = pmtu_expires; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - atomic_inc(&__rt_peer_genid); - rt->rt_peer_genid = rt_peer_genid(); - } - check_peer_pmtu(dst, peer); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4); + ip_rt_put(rt); } } +EXPORT_SYMBOL_GPL(ipv4_redirect); - -static void ipv4_validate_peer(struct rtable *rt) +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - if (rt->rt_peer_genid != rt_peer_genid()) { - struct inet_peer *peer; - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 0); - - peer = rt->peer; - if (peer) { - check_peer_pmtu(&rt->dst, peer); - - if (peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) - check_peer_redir(&rt->dst, peer); - } + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - rt->rt_peer_genid = rt_peer_genid(); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4); + ip_rt_put(rt); } } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { @@ -1810,23 +1763,17 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) if (rt_is_expired(rt)) return NULL; - ipv4_validate_peer(rt); return dst; } static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer = rt->peer; if (rt->fi) { fib_info_put(rt->fi); rt->fi = NULL; } - if (peer) { - rt->peer = NULL; - inet_putpeer(peer); - } } @@ -1837,15 +1784,15 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) - dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); + if (rt) + dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", - &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, - skb->dev ? skb->dev->name : "?"); + pr_debug("%s: %pI4 -> %pI4, %s\n", + __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + skb->dev ? skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; @@ -1918,7 +1865,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) static unsigned int ipv4_mtu(const struct dst_entry *dst) { const struct rtable *rt = (const struct rtable *) dst; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + unsigned int mtu = rt->rt_pmtu; + + if (mtu && time_after_eq(jiffies, rt->dst.expires)) + mtu = 0; + + if (!mtu) + mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu && rt_is_output_route(rt)) return mtu; @@ -1940,36 +1893,50 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, struct fib_info *fi) { - struct inet_peer *peer; - int create = 0; + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); +} - /* If a peer entry exists for this destination, we must hook - * it up in order to get at cached metrics. - */ - if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) - create = 1; +static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; - rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); - if (peer) { - rt->rt_peer_genid = rt_peer_genid(); - if (inet_metrics_new(peer)) - memcpy(peer->metrics, fi->fib_metrics, - sizeof(u32) * RTAX_MAX); - dst_init_metrics(&rt->dst, peer->metrics, false); + hval = fnhe_hashfun(daddr); - check_peer_pmtu(&rt->dst, peer); +restart: + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + __be32 fnhe_daddr, gw; + unsigned long expires; + unsigned int seq; + u32 pmtu; + + seq = read_seqbegin(&fnhe_seqlock); + fnhe_daddr = fnhe->fnhe_daddr; + gw = fnhe->fnhe_gw; + pmtu = fnhe->fnhe_pmtu; + expires = fnhe->fnhe_expires; + if (read_seqretry(&fnhe_seqlock, seq)) + goto restart; + if (daddr != fnhe_daddr) + continue; + if (pmtu) { + unsigned long diff = expires - jiffies; - if (peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) { - rt->rt_gateway = peer->redirect_learned.a4; - rt->rt_flags |= RTCF_REDIRECTED; - } - } else { - if (fi->fib_metrics != (u32 *) dst_default_metrics) { - rt->fi = fi; - atomic_inc(&fi->fib_clntref); + if (time_before(jiffies, expires)) { + rt->rt_pmtu = pmtu; + dst_set_expires(&rt->dst, diff); + } } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + if (gw) + rt->rt_gateway = gw; + fnhe->fnhe_stamp = jiffies; + break; } } @@ -1977,26 +1944,22 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { - struct dst_entry *dst = &rt->dst; - if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = nh->nh_gw; + if (unlikely(nh->nh_exceptions)) + rt_bind_exception(rt, nh, fl4->daddr); rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID - dst->tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } - if (dst_mtu(dst) > IP_MAX_MTU) - dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); - if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) - dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); - #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES - set_class_tag(rt, fib_rules_tclass(res)); + set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif @@ -2017,7 +1980,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { unsigned int hash; struct rtable *rth; - __be32 spec_dst; struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; int err; @@ -2028,20 +1990,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) + skb->protocol != htons(ETH_P_IP)) goto e_inval; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(saddr)) + goto e_inval; + if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) goto e_inval; - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) goto e_err; } - rth = rt_dst_alloc(init_net.loopback_dev, + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; @@ -2063,10 +2028,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; + rth->rt_pmtu = 0; rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_peer_genid = 0; - rth->peer = NULL; rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; @@ -2128,20 +2091,18 @@ static int __mkroute_input(struct sk_buff *skb, int err; struct in_device *out_dev; unsigned int flags = 0; - __be32 spec_dst; u32 itag; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { - if (net_ratelimit()) - pr_crit("Bug in ip_route_input_slow(). Please report.\n"); + net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; } err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2192,10 +2153,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; + rth->rt_pmtu = 0; rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_peer_genid = 0; - rth->peer = NULL; rth->fi = NULL; rth->dst.input = ip_forward; @@ -2215,9 +2174,9 @@ static int ip_mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { - struct rtable* rth = NULL; + struct rtable *rth = NULL; int err; - unsigned hash; + unsigned int hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) @@ -2255,13 +2214,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flowi4 fl4; - unsigned flags = 0; + unsigned int flags = 0; u32 itag = 0; - struct rtable * rth; - unsigned hash; - __be32 spec_dst; + struct rtable *rth; + unsigned int hash; int err = -EINVAL; - struct net * net = dev_net(dev); + struct net *net = dev_net(dev); /* IP on this device is disabled. */ @@ -2272,8 +2230,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) @@ -2285,9 +2242,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) goto martian_source; - if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) + if (ipv4_is_zeronet(daddr)) goto martian_destination; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { + if (ipv4_is_loopback(daddr)) + goto martian_destination; + + if (ipv4_is_loopback(saddr)) + goto martian_source; + } + /* * Now we are ready to route packet. */ @@ -2299,11 +2264,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); - if (err != 0) { - if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + if (err != 0) goto no_route; - } RT_CACHE_STAT_INC(in_slow_tot); @@ -2313,17 +2275,16 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; - spec_dst = daddr; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + goto no_route; if (res.type != RTN_UNICAST) goto martian_destination; @@ -2334,11 +2295,9 @@ brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ipv4_is_zeronet(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + if (!ipv4_is_zeronet(saddr)) { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2368,17 +2327,12 @@ local_input: rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; -#ifdef CONFIG_IP_ROUTE_CLASSID - rth->dst.tclassid = itag; -#endif rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; + rth->rt_pmtu = 0; rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_peer_genid = 0; - rth->peer = NULL; rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -2394,7 +2348,6 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; @@ -2406,15 +2359,11 @@ no_route: martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - pr_warn("martian destination %pI4 from %pI4, dev %s\n", - &daddr, &saddr, dev->name); + if (IN_DEV_LOG_MARTIANS(in_dev)) + net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); #endif -e_hostunreach: - err = -EHOSTUNREACH; - goto out; - e_inval: err = -EINVAL; goto out; @@ -2433,8 +2382,8 @@ martian_source_keep_err: int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, bool noref) { - struct rtable * rth; - unsigned hash; + struct rtable *rth; + unsigned int hash; int iif = dev->ifindex; struct net *net; int res; @@ -2458,7 +2407,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - ipv4_validate_peer(rth); if (noref) { dst_use_noref(&rth->dst, jiffies); skb_dst_set_noref(skb, &rth->dst); @@ -2526,9 +2474,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res, u16 type = res->type; struct rtable *rth; - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) return ERR_PTR(-EINVAL); + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); + if (ipv4_is_lbcast(fl4->daddr)) type = RTN_BROADCAST; else if (ipv4_is_multicast(fl4->daddr)) @@ -2539,10 +2492,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - in_dev = __in_dev_get_rcu(dev_out); - if (!in_dev) - return ERR_PTR(-EINVAL); - if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; fi = NULL; @@ -2579,20 +2528,15 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; rth->rt_mark = fl4->flowi4_mark; + rth->rt_pmtu = 0; rth->rt_gateway = fl4->daddr; - rth->rt_spec_dst= fl4->saddr; - rth->rt_peer_genid = 0; - rth->peer = NULL; rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) { + if (flags & RTCF_LOCAL) rth->dst.input = ip_local_deliver; - rth->rt_spec_dst = fl4->daddr; - } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; @@ -2611,6 +2555,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rt_set_nexthop(rth, fl4, res, fi, type, 0); + if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) + rth->dst.flags |= DST_NOCACHE; + return rth; } @@ -2630,10 +2577,9 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) __be32 orig_saddr; int orig_oif; + res.tclassid = 0; res.fi = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif + res.table = NULL; orig_daddr = fl4->daddr; orig_saddr = fl4->saddr; @@ -2736,6 +2682,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) if (fib_lookup(net, fl4, &res)) { res.fi = NULL; + res.table = NULL; if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2834,7 +2781,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - ipv4_validate_peer(rth); dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2865,7 +2811,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) return mtu ? : dst->dev->mtu; } -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } @@ -2883,6 +2835,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .redirect = ipv4_rt_blackhole_redirect, .cow_metrics = ipv4_rt_blackhole_cow_metrics, .neigh_lookup = ipv4_neigh_lookup, }; @@ -2898,7 +2851,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; new->output = dst_discard; - dst_copy_metrics(new, &ort->dst); new->dev = ort->dst.dev; if (new->dev) @@ -2911,6 +2863,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_iif = ort->rt_iif; rt->rt_oif = ort->rt_oif; rt->rt_mark = ort->rt_mark; + rt->rt_pmtu = ort->rt_pmtu; rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; @@ -2918,10 +2871,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_dst = ort->rt_dst; rt->rt_src = ort->rt_src; rt->rt_gateway = ort->rt_gateway; - rt->rt_spec_dst = ort->rt_spec_dst; - rt->peer = ort->peer; - if (rt->peer) - atomic_inc(&rt->peer->refcnt); rt->fi = ort->fi; if (rt->fi) atomic_inc(&rt->fi->fib_clntref); @@ -2959,8 +2908,8 @@ static int rt_fill_info(struct net *net, struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; - const struct inet_peer *peer = rt->peer; - u32 id = 0, ts = 0, tsage = 0, error; + u32 error; + u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); if (nlh == NULL) @@ -2972,7 +2921,8 @@ static int rt_fill_info(struct net *net, r->rtm_src_len = 0; r->rtm_tos = rt->rt_key_tos; r->rtm_table = RT_TABLE_MAIN; - NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); + if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; @@ -2980,47 +2930,47 @@ static int rt_fill_info(struct net *net, if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; - NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - + if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) + goto nla_put_failure; if (rt->rt_key_src) { r->rtm_src_len = 32; - NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); + if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) + goto nla_put_failure; } - if (rt->dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID - if (rt->dst.tclassid) - NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); + if (rt->dst.tclassid && + nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) + goto nla_put_failure; #endif - if (rt_is_input_route(rt)) - NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); - else if (rt->rt_src != rt->rt_key_src) - NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); - - if (rt->rt_dst != rt->rt_gateway) - NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); + if (!rt_is_input_route(rt) && + rt->rt_src != rt->rt_key_src) { + if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) + goto nla_put_failure; + } + if (rt->rt_dst != rt->rt_gateway && + nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) + goto nla_put_failure; - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt_pmtu) + metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (rt->rt_mark) - NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); + if (rt->rt_mark && + nla_put_be32(skb, RTA_MARK, rt->rt_mark)) + goto nla_put_failure; error = rt->dst.error; - if (peer) { - inet_peer_refcheck(rt->peer); - id = atomic_read(&peer->ip_id_count) & 0xffff; - if (peer->tcp_ts_stamp) { - ts = peer->tcp_ts; - tsage = get_seconds() - peer->tcp_ts_stamp; - } - expires = ACCESS_ONCE(peer->pmtu_expires); - if (expires) { - if (time_before(jiffies, expires)) - expires -= jiffies; - else - expires = 0; - } + expires = rt->dst.expires; + if (expires) { + if (time_before(jiffies, expires)) + expires -= jiffies; + else + expires = 0; } if (rt_is_input_route(rt)) { @@ -3045,11 +2995,11 @@ static int rt_fill_info(struct net *net, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); + if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, - expires, error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -3059,7 +3009,7 @@ nla_put_failure: return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; @@ -3334,23 +3284,6 @@ static ctl_table ipv4_route_table[] = { { } }; -static struct ctl_table empty[1]; - -static struct ctl_table ipv4_skeleton[] = -{ - { .procname = "route", - .mode = 0555, .child = ipv4_route_table}, - { .procname = "neigh", - .mode = 0555, .child = empty}, - { } -}; - -static __net_initdata struct ctl_path ipv4_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { }, -}; - static struct ctl_table ipv4_route_flush_table[] = { { .procname = "flush", @@ -3361,13 +3294,6 @@ static struct ctl_table ipv4_route_flush_table[] = { { }, }; -static __net_initdata struct ctl_path ipv4_route_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "route", }, - { }, -}; - static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; @@ -3380,8 +3306,7 @@ static __net_init int sysctl_route_net_init(struct net *net) } tbl[0].extra1 = net; - net->ipv4.route_hdr = - register_net_sysctl_table(net, ipv4_route_path, tbl); + net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); if (net->ipv4.route_hdr == NULL) goto err_reg; return 0; @@ -3422,6 +3347,30 @@ static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; +static int __net_init ipv4_inetpeer_init(struct net *net) +{ + struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); + + if (!bp) + return -ENOMEM; + inet_peer_base_init(bp); + net->ipv4.peers = bp; + return 0; +} + +static void __net_exit ipv4_inetpeer_exit(struct net *net) +{ + struct inet_peer_base *bp = net->ipv4.peers; + + net->ipv4.peers = NULL; + inetpeer_invalidate_tree(bp); + kfree(bp); +} + +static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { + .init = ipv4_inetpeer_init, + .exit = ipv4_inetpeer_exit, +}; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; @@ -3430,9 +3379,15 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) { + ssize_t ret; + if (!str) return 0; - rhash_entries = simple_strtoul(str, &str, 0); + + ret = kstrtoul(str, 0, &rhash_entries); + if (ret) + return 0; + return 1; } __setup("rhash_entries=", set_rhash_entries); @@ -3468,6 +3423,7 @@ int __init ip_rt_init(void) 0, &rt_hash_log, &rt_hash_mask, + 0, rhash_entries ? 0 : 512 * 1024); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); @@ -3495,6 +3451,7 @@ int __init ip_rt_init(void) register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&rt_genid_ops); + register_pernet_subsys(&ipv4_inetpeer_ops); return rc; } @@ -3505,6 +3462,6 @@ int __init ip_rt_init(void) */ void __init ip_static_sysctl_init(void) { - register_sysctl_paths(ipv4_path, ipv4_skeleton); + register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eab2a7fb15d1..650e1528e1e6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) goto out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7a7724da9bff..5840c3255721 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -27,6 +27,7 @@ #include <net/tcp_memcontrol.h> static int zero; +static int two = 2; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -78,7 +79,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) { gid_t *data = table->data; - unsigned seq; + unsigned int seq; do { seq = read_seqbegin(&sysctl_local_ports.lock); @@ -300,6 +301,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_early_demux", + .data = &sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, .maxlen = sizeof(int), @@ -359,6 +367,13 @@ static struct ctl_table ipv4_table[] = { }, #endif { + .procname = "tcp_fastopen", + .data = &sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), @@ -590,6 +605,20 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_challenge_ack_limit", + .data = &sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", @@ -677,6 +706,15 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_early_retrans", + .data = &sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), @@ -768,13 +806,6 @@ static struct ctl_table ipv4_net_table[] = { { } }; -struct ctl_path net_ipv4_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { }, -}; -EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); - static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; @@ -815,8 +846,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) tcp_init_mem(net); - net->ipv4.ipv4_hdr = register_net_sysctl_table(net, - net_ipv4_ctl_path, table); + net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; @@ -857,12 +887,12 @@ static __init int sysctl_ipv4_init(void) if (!i->procname) return -EINVAL; - hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); + hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) return -ENOMEM; if (register_pernet_subsys(&ipv4_sysctl_ops)) { - unregister_sysctl_table(hdr); + unregister_net_sysctl_table(hdr); return -ENOMEM; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cfd7edda0a8e..581ecf02c6b5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include <linux/slab.h> #include <net/icmp.h> +#include <net/inet_common.h> #include <net/tcp.h> #include <net/xfrm.h> #include <net/ip.h> @@ -363,6 +364,72 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) return period; } +/* Address-family independent initialization for a tcp_sock. + * + * NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +void tcp_init_sock(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); + + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = TCP_INIT_CWND; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = TCP_MSS_DEFAULT; + + tp->reordering = sysctl_tcp_reordering; + tcp_enable_early_retrans(tp); + icsk->icsk_ca_ops = &tcp_init_congestion_ops; + + sk->sk_state = TCP_CLOSE; + + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + icsk->icsk_sync_mss = tcp_sync_mss; + + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + local_bh_disable(); + sock_update_memcg(sk); + sk_sockets_allocated_inc(sk); + local_bh_enable(); +} +EXPORT_SYMBOL(tcp_init_sock); + /* * Wait for a TCP event. * @@ -528,7 +595,7 @@ static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) tp->pushed_seq = tp->write_seq; } -static inline int forced_push(const struct tcp_sock *tp) +static inline bool forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } @@ -701,11 +768,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (skb) { if (sk_wmem_schedule(sk, skb->truesize)) { + skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb_reserve(skb, skb_tailroom(skb) - size); + skb->avail_size = size; return skb; } __kfree_skb(skb); @@ -730,6 +798,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); + /* TSQ : try to have two TSO segments in flight */ + xmit_size_goal = min_t(u32, xmit_size_goal, + sysctl_tcp_limit_output_bytes >> 1); + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); /* We try hard to avoid divides here */ @@ -783,9 +855,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse while (psize > 0) { struct sk_buff *skb = tcp_write_queue_tail(sk); struct page *page = pages[poffset / PAGE_SIZE]; - int copy, i, can_coalesce; + int copy, i; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); + bool can_coalesce; if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: @@ -850,8 +923,7 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - if (copied) - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -860,7 +932,7 @@ wait_for_memory: } out: - if (copied) + if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle); return copied; @@ -911,27 +983,81 @@ static inline int select_size(const struct sock *sk, bool sg) return tmp; } +void tcp_free_fastopen_req(struct tcp_sock *tp) +{ + if (tp->fastopen_req != NULL) { + kfree(tp->fastopen_req); + tp->fastopen_req = NULL; + } +} + +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err, flags; + + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + return -EOPNOTSUPP; + if (tp->fastopen_req != NULL) + return -EALREADY; /* Another Fast Open is in progress */ + + tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), + sk->sk_allocation); + if (unlikely(tp->fastopen_req == NULL)) + return -ENOBUFS; + tp->fastopen_req->data = msg; + + flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; + err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, flags); + *size = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + return err; +} + int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int iovlen, flags, err, copied; - int mss_now, size_goal; + int iovlen, flags, err, copied = 0; + int mss_now = 0, size_goal, copied_syn = 0, offset = 0; bool sg; long timeo; lock_sock(sk); flags = msg->msg_flags; + if (flags & MSG_FASTOPEN) { + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); + if (err == -EINPROGRESS && copied_syn > 0) + goto out; + else if (err) + goto out_err; + offset = copied_syn; + } + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto do_error; + + if (unlikely(tp->repair)) { + if (tp->repair_queue == TCP_RECV_QUEUE) { + copied = tcp_send_rcvq(sk, msg, size); + goto out; + } + + err = -EINVAL; + if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; + /* 'common' sending to sendq */ + } + /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -953,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, unsigned char __user *from = iov->iov_base; iov++; + if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ + if (offset >= seglen) { + offset -= seglen; + continue; + } + seglen -= offset; + from += offset; + offset = 0; + } while (seglen > 0) { int copy = 0; @@ -995,15 +1130,14 @@ new_segment: copy = seglen; /* Where to copy to? */ - if (skb_tailroom(skb) > 0) { + if (skb_availroom(skb) > 0) { /* We have some space in skb head. Superb! */ - if (copy > skb_tailroom(skb)) - copy = skb_tailroom(skb); + copy = min_t(int, copy, skb_availroom(skb)); err = skb_add_data_nocache(sk, skb, from, copy); if (err) goto do_fault; } else { - int merge = 0; + bool merge = false; int i = skb_shinfo(skb)->nr_frags; struct page *page = sk->sk_sndmsg_page; int off; @@ -1017,7 +1151,7 @@ new_segment: off != PAGE_SIZE) { /* We can extend the last page * fragment. */ - merge = 1; + merge = true; } else if (i == MAX_SKB_FRAGS || !sg) { /* Need to add new fragment and cannot * do this because interface is non-SG, @@ -1089,7 +1223,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len < max || (flags & MSG_OOB)) + if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { @@ -1102,7 +1236,7 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - if (copied) + if (copied && likely(!tp->repair)) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) @@ -1113,10 +1247,10 @@ wait_for_memory: } out: - if (copied) + if (copied && likely(!tp->repair)) tcp_push(sk, flags, mss_now, tp->nonagle); release_sock(sk); - return copied; + return copied + copied_syn; do_fault: if (!skb->len) { @@ -1129,7 +1263,7 @@ do_fault: } do_error: - if (copied) + if (copied + copied_syn) goto out; out_err: err = sk_stream_error(sk, flags, err); @@ -1187,6 +1321,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) return -EAGAIN; } +static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) +{ + struct sk_buff *skb; + int copied = 0, err = 0; + + /* XXX -- need to support SO_PEEK_OFF */ + + skb_queue_walk(&sk->sk_write_queue, skb) { + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + if (err) + break; + + copied += skb->len; + } + + return err ?: copied; +} + /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the @@ -1196,7 +1348,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); - int time_to_ack = 0; + bool time_to_ack = false; struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); @@ -1222,7 +1374,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !icsk->icsk_ack.pingpong)) && !atomic_read(&sk->sk_rmem_alloc))) - time_to_ack = 1; + time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window @@ -1244,7 +1396,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) * "Lots" means "at least twice" here. */ if (new_window && new_window >= 2 * rcv_window_now) - time_to_ack = 1; + time_to_ack = true; } } if (time_to_ack) @@ -1376,11 +1528,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, 0); + sk_eat_skb(sk, skb, false); ++seq; break; } - sk_eat_skb(sk, skb, 0); + sk_eat_skb(sk, skb, false); if (!desc->count) break; tp->copied_seq = seq; @@ -1416,7 +1568,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - int copied_early = 0; + bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; @@ -1432,6 +1584,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto recv_urg; + if (unlikely(tp->repair)) { + err = -EPERM; + if (!(flags & MSG_PEEK)) + goto out; + + if (tp->repair_queue == TCP_SEND_QUEUE) + goto recv_sndq; + + err = -EINVAL; + if (tp->repair_queue == TCP_NO_QUEUE) + goto out; + + /* 'common' recv queue MSG_PEEK-ing */ + } + seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_seq = tp->copied_seq; @@ -1452,7 +1619,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if ((available < target) && (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && - dma_find_channel(DMA_MEMCPY)) { + net_dma_find_channel()) { preempt_enable_no_resched(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); @@ -1633,9 +1800,9 @@ do_prequeue: } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { - if (net_ratelimit()) - printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", - current->comm, task_pid_nr(current)); + net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", + current->comm, + task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; @@ -1667,7 +1834,7 @@ do_prequeue: if (!(flags & MSG_TRUNC)) { #ifdef CONFIG_NET_DMA if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan) { tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( @@ -1689,7 +1856,7 @@ do_prequeue: dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); if ((offset + used) == skb->len) - copied_early = 1; + copied_early = true; } else #endif @@ -1723,7 +1890,7 @@ skip_copy: goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); - copied_early = 0; + copied_early = false; } continue; @@ -1732,7 +1899,7 @@ skip_copy: ++*seq; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); - copied_early = 0; + copied_early = false; } break; } while (len > 0); @@ -1783,6 +1950,10 @@ out: recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; + +recv_sndq: + err = tcp_peek_sndq(sk, msg, len); + goto out; } EXPORT_SYMBOL(tcp_recvmsg); @@ -1886,10 +2057,10 @@ bool tcp_check_oom(struct sock *sk, int shift) too_many_orphans = tcp_too_many_orphans(sk, shift); out_of_socket_memory = tcp_out_of_memory(sk); - if (too_many_orphans && net_ratelimit()) - pr_info("too many orphaned sockets\n"); - if (out_of_socket_memory && net_ratelimit()) - pr_info("out of memory -- consider tuning tcp_mem\n"); + if (too_many_orphans) + net_info_ratelimited("too many orphaned sockets\n"); + if (out_of_socket_memory) + net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } @@ -1935,7 +2106,9 @@ void tcp_close(struct sock *sk, long timeout) * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ - if (data_was_unread) { + if (unlikely(tcp_sk(sk)->repair)) { + sk->sk_prot->disconnect(sk, 0); + } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); @@ -2053,7 +2226,7 @@ EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ -static inline int tcp_need_reset(int state) +static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | @@ -2074,6 +2247,8 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); + } else if (unlikely(tp->repair)) { + sk->sk_err = ECONNABORTED; } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -2125,6 +2300,54 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); +static inline bool tcp_can_repair_sock(const struct sock *sk) +{ + return capable(CAP_NET_ADMIN) && + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); +} + +static int tcp_repair_options_est(struct tcp_sock *tp, + struct tcp_repair_opt __user *optbuf, unsigned int len) +{ + struct tcp_repair_opt opt; + + while (len >= sizeof(opt)) { + if (copy_from_user(&opt, optbuf, sizeof(opt))) + return -EFAULT; + + optbuf++; + len -= sizeof(opt); + + switch (opt.opt_code) { + case TCPOPT_MSS: + tp->rx_opt.mss_clamp = opt.opt_val; + break; + case TCPOPT_WINDOW: + if (opt.opt_val > 14) + return -EFBIG; + + tp->rx_opt.snd_wscale = opt.opt_val; + break; + case TCPOPT_SACK_PERM: + if (opt.opt_val != 0) + return -EINVAL; + + tp->rx_opt.sack_ok |= TCP_SACK_SEEN; + if (sysctl_tcp_fack) + tcp_enable_fack(tp); + break; + case TCPOPT_TIMESTAMP: + if (opt.opt_val != 0) + return -EINVAL; + + tp->rx_opt.tstamp_ok = 1; + break; + } + } + + return 0; +} + /* * Socket option code for TCP. */ @@ -2295,6 +2518,55 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; else tp->thin_dupack = val; + if (tp->thin_dupack) + tcp_disable_early_retrans(tp); + break; + + case TCP_REPAIR: + if (!tcp_can_repair_sock(sk)) + err = -EPERM; + else if (val == 1) { + tp->repair = 1; + sk->sk_reuse = SK_FORCE_REUSE; + tp->repair_queue = TCP_NO_QUEUE; + } else if (val == 0) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; + tcp_send_window_probe(sk); + } else + err = -EINVAL; + + break; + + case TCP_REPAIR_QUEUE: + if (!tp->repair) + err = -EPERM; + else if (val < TCP_QUEUES_NR) + tp->repair_queue = val; + else + err = -EINVAL; + break; + + case TCP_QUEUE_SEQ: + if (sk->sk_state != TCP_CLOSE) + err = -EPERM; + else if (tp->repair_queue == TCP_SEND_QUEUE) + tp->write_seq = val; + else if (tp->repair_queue == TCP_RECV_QUEUE) + tp->rcv_nxt = val; + else + err = -EINVAL; + break; + + case TCP_REPAIR_OPTIONS: + if (!tp->repair) + err = -EINVAL; + else if (sk->sk_state == TCP_ESTABLISHED) + err = tcp_repair_options_est(tp, + (struct tcp_repair_opt __user *)optval, + optlen); + else + err = -EPERM; break; case TCP_CORK: @@ -2530,6 +2802,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; + if (tp->repair) + val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); @@ -2632,6 +2906,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->thin_dupack; break; + case TCP_REPAIR: + val = tp->repair; + break; + + case TCP_REPAIR_QUEUE: + if (tp->repair) + val = tp->repair_queue; + else + return -EINVAL; + break; + + case TCP_QUEUE_SEQ: + if (tp->repair_queue == TCP_SEND_QUEUE) + val = tp->write_seq; + else if (tp->repair_queue == TCP_RECV_QUEUE) + val = tp->rcv_nxt; + else + return -EINVAL; + break; + case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; @@ -2675,7 +2969,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; - unsigned thlen; + unsigned int thlen; unsigned int seq; __be32 delta; unsigned int oldlen; @@ -2933,13 +3227,13 @@ out_free: struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) { struct tcp_md5sig_pool __percpu *pool; - int alloc = 0; + bool alloc = false; retry: spin_lock_bh(&tcp_md5sig_pool_lock); pool = tcp_md5sig_pool; if (tcp_md5sig_users++ == 0) { - alloc = 1; + alloc = true; spin_unlock_bh(&tcp_md5sig_pool_lock); } else if (!pool) { tcp_md5sig_users--; @@ -3033,9 +3327,9 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, struct scatterlist sg; const struct tcphdr *tp = tcp_hdr(skb); struct hash_desc *desc = &hp->md5_desc; - unsigned i; - const unsigned head_data_len = skb_headlen(skb) > header_len ? - skb_headlen(skb) - header_len : 0; + unsigned int i; + const unsigned int head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); struct sk_buff *frag_iter; @@ -3072,8 +3366,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif -/** - * Each Responder maintains up to two secret values concurrently for +/* Each Responder maintains up to two secret values concurrently for * efficient secret rollover. Each secret value has 4 states: * * Generating. (tcp_secret_generating != tcp_secret_primary) @@ -3223,9 +3516,15 @@ extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { + ssize_t ret; + if (!str) return 0; - thash_entries = simple_strtoul(str, &str, 0); + + ret = kstrtoul(str, 0, &thash_entries); + if (ret) + return 0; + return 1; } __setup("thash_entries=", set_thash_entries); @@ -3243,7 +3542,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int max_share, cnt; + int max_rshare, max_wshare, cnt; unsigned int i; unsigned long jiffy = jiffies; @@ -3270,6 +3569,7 @@ void __init tcp_init(void) 0, NULL, &tcp_hashinfo.ehash_mask, + 0, thash_entries ? 0 : 512 * 1024); for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); @@ -3286,6 +3586,7 @@ void __init tcp_init(void) 0, &tcp_hashinfo.bhash_size, NULL, + 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { @@ -3302,21 +3603,23 @@ void __init tcp_init(void) tcp_init_mem(&init_net); /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); - limit = max(limit, 128UL); - max_share = min(4UL*1024*1024, limit); + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); + max_wshare = min(4UL*1024*1024, limit); + max_rshare = min(6UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_share); + sysctl_tcp_wmem[2] = max(64*1024, max_wshare); sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_share); + sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + tcp_metrics_init(); + tcp_register_congestion_control(&tcp_reno); memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); @@ -3327,4 +3630,5 @@ void __init tcp_init(void) tcp_secret_primary = &tcp_secret_one; tcp_secret_retiring = &tcp_secret_two; tcp_secret_secondary = &tcp_secret_two; + tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 272a84593c85..4d4db16e336e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -280,19 +280,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) /* RFC2861 Check whether we are limited by application or congestion window * This is the inverse of cwnd check in tcp_tso_should_defer */ -int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) { const struct tcp_sock *tp = tcp_sk(sk); u32 left; if (in_flight >= tp->snd_cwnd) - return 1; + return true; left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && left * tp->mss_cache < sk->sk_gso_max_size) - return 1; + return true; return left <= tcp_max_tso_deferred_mss(tp); } EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); @@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); void tcp_slow_start(struct tcp_sock *tp) { int cnt; /* increase in packets */ + unsigned int delta = 0; /* RFC3465: ABC Slow start * Increase only after a full MSS of bytes is acked @@ -333,9 +334,9 @@ void tcp_slow_start(struct tcp_sock *tp) tp->snd_cwnd_cnt += cnt; while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd_cnt -= tp->snd_cwnd; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; + delta++; } + tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp); } EXPORT_SYMBOL_GPL(tcp_slow_start); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c new file mode 100644 index 000000000000..a7f729c409d7 --- /dev/null +++ b/net/ipv4/tcp_fastopen.c @@ -0,0 +1,11 @@ +#include <linux/init.h> +#include <linux/kernel.h> + +int sysctl_tcp_fastopen; + +static int __init tcp_fastopen_init(void) +{ + return 0; +} + +late_initcall(tcp_fastopen_init); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index fe3ecf484b44..57bdd17dff4d 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -15,7 +15,7 @@ /* Tcp Hybla structure. */ struct hybla { - u8 hybla_en; + bool hybla_en; u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ u32 rho; /* Rho parameter, integer part */ u32 rho2; /* Rho * Rho, integer part */ @@ -24,8 +24,7 @@ struct hybla { u32 minrtt; /* Minimum smoothed round trip time value seen */ }; -/* Hybla reference round trip time (default= 1/40 sec = 25 ms), - expressed in jiffies */ +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ static int rtt0 = 25; module_param(rtt0, int, 0644); MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); @@ -39,7 +38,7 @@ static inline void hybla_recalc_param (struct sock *sk) ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; - ca->rho2 = ca->rho2_7ls >>7; + ca->rho2 = ca->rho2_7ls >> 7; } static void hybla_init(struct sock *sk) @@ -52,7 +51,7 @@ static void hybla_init(struct sock *sk) ca->rho_3ls = 0; ca->rho2_7ls = 0; ca->snd_cwnd_cents = 0; - ca->hybla_en = 1; + ca->hybla_en = true; tp->snd_cwnd = 2; tp->snd_cwnd_clamp = 65535; @@ -67,6 +66,7 @@ static void hybla_init(struct sock *sk) static void hybla_state(struct sock *sk, u8 ca_state) { struct hybla *ca = inet_csk_ca(sk); + ca->hybla_en = (ca_state == TCP_CA_Open); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e886e2f7fa8d..21d7f8f3a7a5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,20 +85,23 @@ int sysctl_tcp_ecn __read_mostly = 2; EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 2; +int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 100; + int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; -int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_abc __read_mostly; +int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -175,7 +178,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) static void tcp_incr_quickack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); + unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; @@ -195,9 +198,10 @@ static void tcp_enter_quickack_mode(struct sock *sk) * and the session is not interactive. */ -static inline int tcp_in_quickack_mode(const struct sock *sk) +static inline bool tcp_in_quickack_mode(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } @@ -252,11 +256,11 @@ static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) +static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) - return 1; - return 0; + return true; + return false; } /* Buffer size and advertised window tuning. @@ -335,6 +339,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) incr = __tcp_grow_window(sk, skb); if (incr) { + incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; @@ -474,8 +479,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) if (!win_dep) { m -= (new_sample >> 3); new_sample += m; - } else if (m < new_sample) - new_sample = m << 3; + } else { + m <<= 3; + if (m < new_sample) + new_sample = m; + } } else { /* No previous measure. */ new_sample = m << 3; @@ -491,7 +499,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); + tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; @@ -695,7 +703,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -722,109 +730,6 @@ static inline void tcp_set_rto(struct sock *sk) tcp_bound_rto(sk); } -/* Save metrics learned by this TCP session. - This function is called only, when TCP finishes successfully - i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. - */ -void tcp_update_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (sysctl_tcp_nometrics_save) - return; - - dst_confirm(dst); - - if (dst && (dst->flags & DST_HOST)) { - const struct inet_connection_sock *icsk = inet_csk(sk); - int m; - unsigned long rtt; - - if (icsk->icsk_backoff || !tp->srtt) { - /* This session failed to estimate rtt. Why? - * Probably, no packets returned in time. - * Reset our results. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) - dst_metric_set(dst, RTAX_RTT, 0); - return; - } - - rtt = dst_metric_rtt(dst, RTAX_RTT); - m = rtt - tp->srtt; - - /* If newly calculated rtt larger than stored one, - * store new one. Otherwise, use EWMA. Remember, - * rtt overestimation is always better than underestimation. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) { - if (m <= 0) - set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); - else - set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); - } - - if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { - unsigned long var; - if (m < 0) - m = -m; - - /* Scale deviation to rttvar fixed point */ - m >>= 1; - if (m < tp->mdev) - m = tp->mdev; - - var = dst_metric_rtt(dst, RTAX_RTTVAR); - if (m >= var) - var = m; - else - var -= (var - m) >> 2; - - set_dst_metric_rtt(dst, RTAX_RTTVAR, var); - } - - if (tcp_in_initial_slowstart(tp)) { - /* Slow start still did not finish. */ - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); - if (!dst_metric_locked(dst, RTAX_CWND) && - tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); - } else if (tp->snd_cwnd > tp->snd_ssthresh && - icsk->icsk_ca_state == TCP_CA_Open) { - /* Cong. avoidance phase, cwnd is reliable. */ - if (!dst_metric_locked(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_cwnd) >> 1); - } else { - /* Else slow start did not finish, cwnd is non-sense, - ssthresh may be also invalid. - */ - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_ssthresh) >> 1); - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); - } - - if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && - tp->reordering != sysctl_tcp_reordering) - dst_metric_set(dst, RTAX_REORDERING, tp->reordering); - } - } -} - __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -861,7 +766,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */ -static void tcp_disable_fack(struct tcp_sock *tp) +void tcp_disable_fack(struct tcp_sock *tp) { /* RFC3517 uses different metric in lost marker => reset on change */ if (tcp_is_fack(tp)) @@ -875,85 +780,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; } -/* Initialize metrics on socket. */ - -static void tcp_init_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (dst == NULL) - goto reset; - - dst_confirm(dst); - - if (dst_metric_locked(dst, RTAX_CWND)) - tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); - if (dst_metric(dst, RTAX_SSTHRESH)) { - tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } else { - /* ssthresh may have been reduced unnecessarily during. - * 3WHS. Restore it back to its initial default. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - } - if (dst_metric(dst, RTAX_REORDERING) && - tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tcp_disable_fack(tp); - tp->reordering = dst_metric(dst, RTAX_REORDERING); - } - - if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) - goto reset; - - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. - * - * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal circumstances sending small - * packets force peer to delay ACKs and calculation is correct too. - * The algorithm is adaptive and, provided we follow specs, it - * NEVER underestimate RTT. BUT! If peer tries to make some clever - * tricks sort of "quick acks" for time long enough to decrease RTT - * to low value, and then abruptly stops to do it and starts to delay - * ACKs, wait for troubles. - */ - if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric_rtt(dst, RTAX_RTT); - tp->rtt_seq = tp->snd_nxt; - } - if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } - tcp_set_rto(sk); -reset: - if (tp->srtt == 0) { - /* RFC2988bis: We've failed to get a valid RTT sample from - * 3WHS. This is most likely due to retransmission, - * including spurious one. Reset the RTO back to 3secs - * from the more aggressive 1sec to avoid more spurious - * retransmission. - */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; - } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC2988bis' more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. - */ - if (tp->total_retrans > 1) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static void tcp_update_reordering(struct sock *sk, const int metric, const int ts) { @@ -975,15 +801,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric, NET_INC_STATS_BH(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 - printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", - tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, - tp->reordering, - tp->fackets_out, - tp->sacked_out, - tp->undo_marker ? tp->undo_retrans : 0); + pr_debug("Disorder%d %d %u f%u s%u rr%d\n", + tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, + tp->reordering, + tp->fackets_out, + tp->sacked_out, + tp->undo_marker ? tp->undo_retrans : 0); #endif tcp_disable_fack(tp); } + + if (metric > 0) + tcp_disable_early_retrans(tp); } /* This must be called before lost_out is incremented */ @@ -1114,36 +943,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ -static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, - u32 start_seq, u32 end_seq) +static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, + u32 start_seq, u32 end_seq) { /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) - return 0; + return false; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) - return 0; + return false; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) - return 1; + return true; if (!is_dsack || !tp->undo_marker) - return 0; + return false; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (after(end_seq, tp->snd_una)) - return 0; + return false; if (!before(start_seq, tp->undo_marker)) - return 1; + return true; /* Too old */ if (!after(end_seq, tp->undo_marker)) - return 0; + return false; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker. @@ -1215,17 +1044,17 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_retrans_low = new_low_seq; } -static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, - struct tcp_sack_block_wire *sp, int num_sacks, - u32 prior_snd_una) +static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, + struct tcp_sack_block_wire *sp, int num_sacks, + u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); - int dup_sack = 0; + bool dup_sack = false; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = 1; + dup_sack = true; tcp_dsack_seen(tp); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { @@ -1234,7 +1063,7 @@ static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, if (!after(end_seq_0, end_seq_1) && !before(start_seq_0, start_seq_1)) { - dup_sack = 1; + dup_sack = true; tcp_dsack_seen(tp); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); @@ -1265,9 +1094,10 @@ struct tcp_sacktag_state { * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, - u32 start_seq, u32 end_seq) + u32 start_seq, u32 end_seq) { - int in_sack, err; + int err; + bool in_sack; unsigned int pkt_len; unsigned int mss; @@ -1313,7 +1143,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount) + bool dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1393,10 +1223,10 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ -static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, - struct tcp_sacktag_state *state, - unsigned int pcount, int shifted, int mss, - int dup_sack) +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_sacktag_state *state, + unsigned int pcount, int shifted, int mss, + bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); @@ -1446,7 +1276,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); - return 0; + return false; } /* Whole SKB was eaten :-) */ @@ -1469,7 +1299,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); - return 1; + return true; } /* I wish gso_size would have a bit more sane initialization than @@ -1492,7 +1322,7 @@ static int skb_can_shift(const struct sk_buff *skb) static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, - int dup_sack) + bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; @@ -1631,14 +1461,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, - int dup_sack_in) + bool dup_sack_in) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; tcp_for_write_queue_from(skb, sk) { int in_sack = 0; - int dup_sack = dup_sack_in; + bool dup_sack = dup_sack_in; if (skb == tcp_send_head(sk)) break; @@ -1653,7 +1483,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, next_dup->start_seq, next_dup->end_seq); if (in_sack > 0) - dup_sack = 1; + dup_sack = true; } /* skb reference here is a bit tricky to get right, since @@ -1758,7 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; - int found_dup_sack = 0; + bool found_dup_sack = false; int i, j; int first_sack_index; @@ -1789,7 +1619,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, used_sacks = 0; first_sack_index = 0; for (i = 0; i < num_sacks; i++) { - int dup_sack = !i && found_dup_sack; + bool dup_sack = !i && found_dup_sack; sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); @@ -1856,7 +1686,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, while (i < used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; - int dup_sack = (found_dup_sack && (i == first_sack_index)); + bool dup_sack = (found_dup_sack && (i == first_sack_index)); struct tcp_sack_block *next_dup = NULL; if (found_dup_sack && ((i + 1) == first_sack_index)) @@ -1958,9 +1788,9 @@ out: } /* Limits sacked_out so that sum with lost_out isn't ever larger than - * packets_out. Returns zero if sacked_out adjustement wasn't necessary. + * packets_out. Returns false if sacked_out adjustement wasn't necessary. */ -static int tcp_limit_reno_sacked(struct tcp_sock *tp) +static bool tcp_limit_reno_sacked(struct tcp_sock *tp) { u32 holes; @@ -1969,9 +1799,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp) if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; - return 1; + return true; } - return 0; + return false; } /* If we receive more dupacks than we expected counting segments @@ -2025,40 +1855,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp) /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ -int tcp_use_frto(struct sock *sk) +bool tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; if (!sysctl_tcp_frto) - return 0; + return false; /* MTU probe and F-RTO won't really play nicely along currently */ if (icsk->icsk_mtup.probe_size) - return 0; + return false; if (tcp_is_sackfrto(tp)) - return 1; + return true; /* Avoid expensive walking of rexmit queue if possible */ if (tp->retrans_out > 1) - return 0; + return false; skb = tcp_write_queue_head(sk); if (tcp_skb_is_last(sk, skb)) - return 1; + return true; skb = tcp_write_queue_next(sk, skb); /* Skips head */ tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - return 0; + return false; /* Short-circuit when first non-SACKed skb has been checked */ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) break; } - return 1; + return true; } /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO @@ -2294,7 +2124,7 @@ void tcp_enter_loss(struct sock *sk, int how) * * Do processing similar to RTO timeout. */ -static int tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2305,9 +2135,9 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag) tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - return 1; + return true; } - return 0; + return false; } static inline int tcp_fackets_out(const struct tcp_sock *tp) @@ -2335,6 +2165,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } +static bool tcp_pause_early_retransmit(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long delay; + + /* Delay early retransmit and entering fast recovery for + * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples + * available, or RTO is scheduled to fire first. + */ + if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) + return false; + + delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) + return false; + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + tp->early_retrans_delayed = 1; + return true; +} + static inline int tcp_skb_timedout(const struct sock *sk, const struct sk_buff *skb) { @@ -2442,28 +2293,28 @@ static inline int tcp_head_timedout(const struct sock *sk) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk) +static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; /* Do not perform any recovery during F-RTO algorithm */ if (tp->frto_counter) - return 0; + return false; /* Trick#1: The loss is proven. */ if (tp->lost_out) - return 1; + return true; /* Not-A-Trick#2 : Classic rule... */ if (tcp_dupack_heuristics(tp) > tp->reordering) - return 1; + return true; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ if (tcp_is_fack(tp) && tcp_head_timedout(sk)) - return 1; + return true; /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? @@ -2475,7 +2326,7 @@ static int tcp_time_to_recover(struct sock *sk) /* We have nothing to send. This connection is limited * either by receiver window or by application. */ - return 1; + return true; } /* If a thin stream is detected, retransmit after first @@ -2486,9 +2337,19 @@ static int tcp_time_to_recover(struct sock *sk) if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && tcp_is_sack(tp) && !tcp_send_head(sk)) - return 1; + return true; - return 0; + /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious + * retransmissions due to small network reorderings, we implement + * Mitigation A.3 in the RFC and delay the retransmission for a short + * interval if appropriate. + */ + if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && + (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && + !tcp_may_send_now(sk)) + return !tcp_pause_early_retransmit(sk, flag); + + return false; } /* New heuristics: it is possible only after we switched to restart timer @@ -2660,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ -static inline int tcp_packet_delayed(const struct tcp_sock *tp) +static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2676,22 +2537,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", - msg, - &inet->inet_daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), - tp->snd_ssthresh, tp->prior_ssthresh, - tp->packets_out); + pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", + msg, + &inet->inet_daddr, ntohs(inet->inet_dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", - msg, - &np->daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), - tp->snd_ssthresh, tp->prior_ssthresh, - tp->packets_out); + pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", + msg, + &np->daddr, ntohs(inet->inet_dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); } #endif } @@ -2721,13 +2582,13 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_may_undo(const struct tcp_sock *tp) +static inline bool tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk) +static bool tcp_try_undo_recovery(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2752,10 +2613,10 @@ static int tcp_try_undo_recovery(struct sock *sk) * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ tcp_moderate_cwnd(tp); - return 1; + return true; } tcp_set_ca_state(sk, TCP_CA_Open); - return 0; + return false; } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ @@ -2785,19 +2646,19 @@ static void tcp_try_undo_dsack(struct sock *sk) * that successive retransmissions of a segment must not advance * retrans_stamp under any conditions. */ -static int tcp_any_retrans_done(const struct sock *sk) +static bool tcp_any_retrans_done(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (tp->retrans_out) - return 1; + return true; skb = tcp_write_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) - return 1; + return true; - return 0; + return false; } /* Undo during fast recovery after partial ACK. */ @@ -2831,7 +2692,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) } /* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk) +static bool tcp_try_undo_loss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2853,9 +2714,9 @@ static int tcp_try_undo_loss(struct sock *sk) tp->undo_marker = 0; if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); - return 1; + return true; } - return 0; + return false; } static inline void tcp_complete_cwr(struct sock *sk) @@ -2864,11 +2725,14 @@ static inline void tcp_complete_cwr(struct sock *sk) /* Do not moderate cwnd if it's already undone in cwr or recovery. */ if (tp->undo_marker) { - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - else /* PRR */ + tp->snd_cwnd_stamp = tcp_time_stamp; + } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { + /* PRR algorithm. */ tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_time_stamp; + } } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -3018,6 +2882,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } +static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mib_idx; + + if (tcp_is_reno(tp)) + mib_idx = LINUX_MIB_TCPRENORECOVERY; + else + mib_idx = LINUX_MIB_TCPSACKRECOVERY; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (!ece_ack) + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + TCP_ECN_queue_cwr(tp); + } + + tp->bytes_acked = 0; + tp->snd_cwnd_cnt = 0; + tp->prior_cwnd = tp->snd_cwnd; + tp->prr_delivered = 0; + tp->prr_out = 0; + tcp_set_ca_state(sk, TCP_CA_Recovery); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -3037,7 +2933,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, struct tcp_sock *tp = tcp_sk(sk); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0, mib_idx; + int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -3121,7 +3017,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk)) { + if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; } @@ -3138,32 +3034,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } /* Otherwise enter Recovery state */ - - if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENORECOVERY; - else - mib_idx = LINUX_MIB_TCPSACKRECOVERY; - - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - tp->high_seq = tp->snd_nxt; - tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; - - if (icsk->icsk_ca_state < TCP_CA_CWR) { - if (!(flag & FLAG_ECE)) - tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); - } - - tp->bytes_acked = 0; - tp->snd_cwnd_cnt = 0; - tp->prior_cwnd = tp->snd_cwnd; - tp->prr_delivered = 0; - tp->prr_out = 0; - tcp_set_ca_state(sk, TCP_CA_Recovery); + tcp_enter_recovery(sk, (flag & FLAG_ECE)); fast_rexmit = 1; } @@ -3245,16 +3116,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_rearm_rto(struct sock *sk) +void tcp_rearm_rto(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + u32 rto = inet_csk(sk)->icsk_rto; + /* Offset the time elapsed after installing regular RTO */ + if (tp->early_retrans_delayed) { + struct sk_buff *skb = tcp_write_queue_head(sk); + const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); + /* delta may not be positive if the socket is locked + * when the delayed ER timer fires and is rescheduled. + */ + if (delta > 0) + rto = delta; + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX); } + tp->early_retrans_delayed = 0; +} + +/* This function is called when the delayed ER timer fires. TCP enters + * fast recovery and performs fast-retransmit. + */ +void tcp_resume_early_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_rearm_rto(sk); + + /* Stop if ER is disabled after the delayed ER timer is scheduled */ + if (!tp->do_early_retrans) + return; + + tcp_enter_recovery(sk, false); + tcp_update_scoreboard(sk, 1); + tcp_xmit_retransmit_queue(sk); } /* If we get here, the whole TSO packet has not been acked. */ @@ -3289,7 +3191,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; u32 now = tcp_time_stamp; - int fully_acked = 1; + int fully_acked = true; int flag = 0; u32 pkts_acked = 0; u32 reord = tp->packets_out; @@ -3313,7 +3215,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!acked_pcount) break; - fully_acked = 0; + fully_acked = false; } else { acked_pcount = tcp_skb_pcount(skb); } @@ -3430,18 +3332,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!tp->packets_out && tcp_is_sack(tp)) { icsk = inet_csk(sk); if (tp->lost_out) { - printk(KERN_DEBUG "Leak l=%u %d\n", - tp->lost_out, icsk->icsk_ca_state); + pr_debug("Leak l=%u %d\n", + tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { - printk(KERN_DEBUG "Leak s=%u %d\n", - tp->sacked_out, icsk->icsk_ca_state); + pr_debug("Leak s=%u %d\n", + tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { - printk(KERN_DEBUG "Leak r=%u %d\n", - tp->retrans_out, icsk->icsk_ca_state); + pr_debug("Leak r=%u %d\n", + tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } @@ -3469,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk) } } -static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) +static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) { return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } -static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) +static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && @@ -3485,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, +static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { @@ -3592,7 +3494,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static int tcp_process_frto(struct sock *sk, int flag) +static bool tcp_process_frto(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -3608,7 +3510,7 @@ static int tcp_process_frto(struct sock *sk, int flag) if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); - return 1; + return true; } if (!tcp_is_sackfrto(tp)) { @@ -3617,19 +3519,19 @@ static int tcp_process_frto(struct sock *sk, int flag) * data, winupdate */ if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) - return 1; + return true; if (!(flag & FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); - return 1; + return true; } } else { if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); - return 1; + return true; } if ((tp->frto_counter >= 2) && @@ -3639,10 +3541,10 @@ static int tcp_process_frto(struct sock *sk, int flag) /* RFC4138 shortcoming (see comment above) */ if (!(flag & FLAG_FORWARD_PROGRESS) && (flag & FLAG_NOT_DUP)) - return 1; + return true; tcp_enter_frto_loss(sk, 3, flag); - return 1; + return true; } } @@ -3654,7 +3556,7 @@ static int tcp_process_frto(struct sock *sk, int flag) if (!tcp_may_send_now(sk)) tcp_enter_frto_loss(sk, 2, flag); - return 1; + return true; } else { switch (sysctl_tcp_frto_response) { case 2: @@ -3671,7 +3573,7 @@ static int tcp_process_frto(struct sock *sk, int flag) tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); } - return 0; + return false; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -3689,7 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_sacked = tp->sacked_out; int pkts_acked = 0; int newly_acked_sacked = 0; - int frto_cwnd = 0; + bool frto_cwnd = false; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3703,6 +3605,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + if (tp->early_retrans_delayed) + tcp_rearm_rto(sk); + if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; @@ -3783,9 +3688,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_cong_avoid(sk, ack, prior_in_flight); } - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) - dst_confirm(__sk_dst_get(sk)); - + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { + struct dst_entry *dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + } return 1; no_queue: @@ -3825,7 +3732,8 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab) + const u8 **hvpp, int estab, + struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3868,10 +3776,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if (net_ratelimit()) - pr_info("%s: Illegal window scaling value %d >14 received\n", - __func__, - snd_wscale); + net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", + __func__, + snd_wscale); snd_wscale = 14; } opt_rx->snd_wscale = snd_wscale; @@ -3933,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o break; } break; - } + case TCPOPT_EXP: + /* Fast Open option shares code 254 using a + * 16 bits magic number. It's valid only in + * SYN or SYN-ACK with an even size. + */ + if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || + get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || + foc == NULL || !th->syn || (opsize & 1)) + break; + foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; + if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && + foc->len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, ptr + 2, foc->len); + else if (foc->len != 0) + foc->len = -1; + break; + + } ptr += opsize-2; length -= opsize; } @@ -3942,7 +3866,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o } EXPORT_SYMBOL(tcp_parse_options); -static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) +static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) { const __be32 *ptr = (const __be32 *)(th + 1); @@ -3953,31 +3877,31 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; tp->rx_opt.rcv_tsecr = ntohl(*ptr); - return 1; + return true; } - return 0; + return false; } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static int tcp_fast_parse_options(const struct sk_buff *skb, - const struct tcphdr *th, - struct tcp_sock *tp, const u8 **hvpp) +static bool tcp_fast_parse_options(const struct sk_buff *skb, + const struct tcphdr *th, + struct tcp_sock *tp, const u8 **hvpp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; - return 0; + return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) - return 1; + return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); - return 1; + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); + return true; } #ifdef CONFIG_TCP_MD5SIG @@ -4082,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(const struct sock *sk, +static inline bool tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4104,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); @@ -4218,7 +4142,7 @@ static void tcp_fin(struct sock *sk) } } -static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, +static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { @@ -4226,9 +4150,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, sp->start_seq = seq; if (after(end_seq, sp->end_seq)) sp->end_seq = end_seq; - return 1; + return true; } - return 0; + return false; } static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) @@ -4424,10 +4348,10 @@ static void tcp_ofo_queue(struct sock *sk) } } -static int tcp_prune_ofo_queue(struct sock *sk); +static bool tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, size)) { @@ -4446,6 +4370,46 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) return 0; } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * @fragstolen: pointer to boolean + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. + * Returns true if caller should free @from instead of queueing it + */ +static bool tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + int delta; + + *fragstolen = false; + + if (tcp_hdr(from)->fin) + return false; + + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; + + if (!skb_try_coalesce(to, from, fragstolen, &delta)) + return false; + + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + return true; +} + static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4454,8 +4418,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (tcp_try_rmem_schedule(sk, skb->truesize)) { - /* TODO: should increment a counter */ + if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; } @@ -4464,6 +4428,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4484,23 +4449,13 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - /* Packets in ofo can stay in queue a long time. - * Better try to coalesce them right now - * to avoid future tcp_collapse_ofo_queue(), - * probably the most expensive function in tcp stack. - */ - if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPRCVCOALESCE); - BUG_ON(skb_copy_bits(skb, 0, - skb_put(skb1, skb->len), - skb->len)); - TCP_SKB_CB(skb1)->end_seq = end_seq; - TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq; - __kfree_skb(skb); - skb = NULL; - } else { + bool fragstolen; + + if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + } else { + kfree_skb_partial(skb, fragstolen); + skb = NULL; } if (!tp->rx_opt.num_sacks || @@ -4527,6 +4482,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); @@ -4565,6 +4521,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb1); } @@ -4576,12 +4533,65 @@ end: skb_set_owner_r(skb, sk); } +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, + bool *fragstolen) +{ + int eaten; + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); + + __skb_pull(skb, hdrlen); + eaten = (tail && + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; + tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (!eaten) { + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + } + return eaten; +} + +int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct sk_buff *skb; + struct tcphdr *th; + bool fragstolen; + + if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) + goto err; + + skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); + if (!skb) + goto err; + + th = (struct tcphdr *)skb_put(skb, sizeof(*th)); + skb_reset_transport_header(skb); + memset(th, 0, sizeof(*th)); + + if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) + goto err_free; + + TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; + TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; + + if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { + WARN_ON_ONCE(fragstolen); /* should not happen */ + __kfree_skb(skb); + } + return size; + +err_free: + kfree_skb(skb); +err: + return -ENOMEM; +} static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; + bool fragstolen = false; if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; @@ -4626,8 +4636,7 @@ queue_and_out: tcp_try_rmem_schedule(sk, skb->truesize)) goto drop; - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) @@ -4651,7 +4660,7 @@ queue_and_out: tcp_fast_path_check(sk); if (eaten > 0) - __kfree_skb(skb); + kfree_skb_partial(skb, fragstolen); else if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, 0); return; @@ -4871,10 +4880,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk) * Purge the out-of-order queue. * Return true if queue was pruned. */ -static int tcp_prune_ofo_queue(struct sock *sk) +static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - int res = 0; + bool res = false; if (!skb_queue_empty(&tp->out_of_order_queue)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); @@ -4888,7 +4897,7 @@ static int tcp_prune_ofo_queue(struct sock *sk) if (tp->rx_opt.sack_ok) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); - res = 1; + res = true; } return res; } @@ -4965,7 +4974,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(const struct sock *sk) +static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4973,21 +4982,21 @@ static int tcp_should_expand_sndbuf(const struct sock *sk) * not modify it. */ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) - return 0; + return false; /* If we are under global TCP memory pressure, do not expand. */ if (sk_under_memory_pressure(sk)) - return 0; + return false; /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) - return 0; + return false; /* If we filled the congestion window, do not expand. */ if (tp->packets_out >= tp->snd_cwnd) - return 0; + return false; - return 1; + return true; } /* When incoming ACK allowed to free some skb from write_queue, @@ -5205,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, return result; } -static inline int tcp_checksum_complete_user(struct sock *sk, +static inline bool tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && @@ -5213,19 +5222,19 @@ static inline int tcp_checksum_complete_user(struct sock *sk, } #ifdef CONFIG_NET_DMA -static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, +static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); int chunk = skb->len - hlen; int dma_cookie; - int copied_early = 0; + bool copied_early = false; if (tp->ucopy.wakeup) - return 0; + return false; if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { @@ -5238,7 +5247,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, goto out; tp->ucopy.dma_cookie = dma_cookie; - copied_early = 1; + copied_early = true; tp->ucopy.len -= chunk; tp->copied_seq += chunk; @@ -5259,11 +5268,28 @@ out: } #endif /* CONFIG_NET_DMA */ +static void tcp_send_challenge_ack(struct sock *sk) +{ + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 now = jiffies / HZ; + + if (now != challenge_timestamp) { + challenge_timestamp = now; + challenge_count = 0; + } + if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } +} + /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) { const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); @@ -5288,14 +5314,26 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ - if (!th->rst) + if (!th->rst) { + if (th->syn) + goto syn_challenge; tcp_send_dupack(sk, skb); + } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - tcp_reset(sk); + /* RFC 5961 3.2 : + * If sequence number exactly matches RCV.NXT, then + * RESET the connection + * else + * Send a challenge ACK + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + tcp_reset(sk); + else + tcp_send_challenge_ack(sk); goto discard; } @@ -5306,20 +5344,23 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ - /* step 4: Check for a SYN in window. */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* step 4: Check for a SYN + * RFC 5691 4.2 : Send a challenge ack + */ + if (th->syn) { +syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + tcp_send_challenge_ack(sk); + goto discard; } - return 1; + return true; discard: __kfree_skb(skb); - return 0; + return false; } /* @@ -5349,7 +5390,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); - int res; + + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (unlikely(dst->obsolete)) { + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + } + if (unlikely(sk->sk_rx_dst == NULL)) + sk->sk_rx_dst = dst_clone(skb_dst(skb)); /* * Header prediction. @@ -5430,6 +5482,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } else { int eaten = 0; int copied_early = 0; + bool fragstolen = false; if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { @@ -5487,10 +5540,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - __skb_pull(skb, tcp_header_len); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + eaten = tcp_queue_rcv(sk, skb, tcp_header_len, + &fragstolen); } tcp_event_data_recv(sk, skb); @@ -5512,7 +5563,7 @@ no_ack: else #endif if (eaten) - __kfree_skb(skb); + kfree_skb_partial(skb, fragstolen); else sk->sk_data_ready(sk, 0); return 0; @@ -5527,9 +5578,8 @@ slow_path: * Standard slow path. */ - res = tcp_validate_incoming(sk, skb, th, 1); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 1)) + return 0; step5: if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) @@ -5556,6 +5606,85 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); +void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); + + if (skb != NULL) { + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + security_inet_conn_established(sk, skb); + } + + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + + tcp_init_metrics(sk); + + tcp_init_congestion_control(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_init_buffer_space(sk); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } +} + +static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, + struct tcp_fastopen_cookie *cookie) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + u16 mss = tp->rx_opt.mss_clamp; + bool syn_drop; + + if (mss == tp->rx_opt.user_mss) { + struct tcp_options_received opt; + const u8 *hash_location; + + /* Get original SYNACK MSS value if user MSS sets mss_clamp */ + tcp_clear_options(&opt); + opt.user_mss = opt.mss_clamp = 0; + tcp_parse_options(synack, &opt, &hash_location, 0, NULL); + mss = opt.mss_clamp; + } + + if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ + cookie->len = -1; + + /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably + * the remote receives only the retransmitted (regular) SYNs: either + * the original SYN-data or the corresponding SYN-ACK is lost. + */ + syn_drop = (cookie->len <= 0 && data && + inet_csk(sk)->icsk_retransmits); + + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); + + if (data) { /* Retransmit unacked data in SYN */ + tcp_retransmit_skb(sk, data); + tcp_rearm_rto(sk); + return true; + } + return false; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { @@ -5563,9 +5692,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; + struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); if (th->ack) { /* rfc793: @@ -5575,11 +5705,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" - * - * We do not send data with SYN, so that RFC-correct - * test reduces to: */ - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) goto reset_and_undo; if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -5688,36 +5816,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, } smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - - security_inet_conn_established(sk, skb); - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - - tcp_init_congestion_control(sk); - - /* Prevent spurious tcp_cwnd_restart() on first data - * packet. - */ - tp->lsndtime = tcp_time_stamp; - - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + tcp_finish_connect(sk, skb); - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } + if ((tp->syn_fastopen || tp->syn_data) && + tcp_rcv_fastopen_synack(sk, skb, &foc)) + return -1; if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || @@ -5731,8 +5835,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ inet_csk_schedule_ack(sk); icsk->icsk_ack.lrcvtime = tcp_time_stamp; - icsk->icsk_ack.ato = TCP_ATO_MIN; - tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); @@ -5839,7 +5941,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; - int res; tp->rx_opt.saw_tstamp = 0; @@ -5894,9 +5995,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - res = tcp_validate_incoming(sk, skb, th, 0); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 0)) + return 0; /* step 5: check the ACK field */ if (th->ack) { @@ -5952,9 +6052,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { + struct dst_entry *dst; + tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst_confirm(__sk_dst_get(sk)); + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); if (!sock_flag(sk, SOCK_DEAD)) /* Wake up lingering close() */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3a25cf743f8b..1d8b75a58981 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -138,6 +138,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_repair_connect(struct sock *sk) +{ + tcp_connect_init(sk); + tcp_finish_connect(sk, NULL); + + return 0; +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -196,26 +204,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; - tp->write_seq = 0; + if (likely(!tp->repair)) + tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { - struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); - /* - * VJ's idea. We save last timestamp seen from - * the destination in peer table, when entering state - * TIME-WAIT * and initialize rx_opt.ts_recent from it, - * when trying new connection. - */ - if (peer) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; - } - } - } + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) + tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; @@ -247,7 +242,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); - if (!tp->write_seq) + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, @@ -255,7 +250,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_id = tp->write_seq ^ jiffies; - err = tcp_connect(sk); + if (likely(!tp->repair)) + err = tcp_connect(sk); + else + err = tcp_repair_connect(sk); + rt = NULL; if (err) goto failure; @@ -290,17 +289,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) if (sk->sk_state == TCP_LISTEN) return; - /* We don't check in the destentry if pmtu discovery is forbidden - * on this route. We just assume that no packet_to_big packets - * are send back when pmtu discovery is not active. - * There is a small race when the user changes this flag in the - * route, but I think that's acceptable. - */ - if ((dst = __sk_dst_check(sk, 0)) == NULL) + dst = inet_csk_update_pmtu(sk, mtu); + if (!dst) return; - dst->ops->update_pmtu(dst, mtu); - /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ @@ -322,6 +314,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) } /* else let the usual retransmit timer handle it */ } +static void do_redirect(struct sk_buff *skb, struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + + if (dst) + dst->ops->redirect(dst, sk, skb); +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -395,6 +395,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (type) { + case ICMP_REDIRECT: + do_redirect(icmp_skb, sk); + goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; @@ -685,8 +688,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, - &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -768,8 +771,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, - &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } @@ -811,7 +814,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp) + struct request_values *rvp, + u16 queue_mapping, + bool nocache) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -819,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); @@ -827,13 +832,13 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); + skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); } - dst_release(dst); return err; } @@ -841,7 +846,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - return tcp_v4_send_synack(sk, NULL, req, rvp); + return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); } /* @@ -853,14 +858,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) } /* - * Return 1 if a syncookie should be sent + * Return true if a syncookie should be sent */ -int tcp_syn_flood_action(struct sock *sk, +bool tcp_syn_flood_action(struct sock *sk, const struct sk_buff *skb, const char *proto) { const char *msg = "Dropping request"; - int want_cookie = 0; + bool want_cookie = false; struct listen_sock *lopt; @@ -868,7 +873,7 @@ int tcp_syn_flood_action(struct sock *sk, #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { msg = "Sending cookies"; - want_cookie = 1; + want_cookie = true; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); } else #endif @@ -1183,7 +1188,7 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1206,16 +1211,16 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) /* We've parsed the options - do we have a hash? */ if (!hash_expected && !hash_location) - return 0; + return false; if (hash_expected && !hash_location) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); - return 1; + return true; } if (!hash_expected && hash_location) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - return 1; + return true; } /* Okay, so this is hash_expected and hash_location - @@ -1226,15 +1231,14 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) NULL, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { - if (net_ratelimit()) { - pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", - &iph->saddr, ntohs(th->source), - &iph->daddr, ntohs(th->dest), - genhash ? " tcp_v4_calc_md5_hash failed" : ""); - } - return 1; + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + &iph->saddr, ntohs(th->source), + &iph->daddr, ntohs(th->dest), + genhash ? " tcp_v4_calc_md5_hash failed" + : ""); + return true; } - return 0; + return false; } #endif @@ -1268,7 +1272,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - int want_cookie = 0; + bool want_cookie = false; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1303,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && @@ -1327,7 +1331,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) while (l-- > 0) *c++ ^= *hash_location++; - want_cookie = 0; /* not our kind of cookie */ + want_cookie = false; /* not our kind of cookie */ tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { @@ -1355,13 +1359,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, tcp_hdr(skb)); + TCP_ECN_create_request(req, skb); if (want_cookie) { isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { - struct inet_peer *peer = NULL; struct flowi4 fl4; /* VJ's idea. We save last timestamp seen @@ -1375,13 +1378,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && - fl4.daddr == saddr && - (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && - (s32)(peer->tcp_ts - req->ts_recent) > - TCP_PAWS_WINDOW) { + (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && + fl4.daddr == saddr) { + if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1390,8 +1389,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst_metric(dst, RTAX_RTT))) { + !tcp_peer_is_proven(req, dst, false)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. @@ -1410,7 +1408,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->snt_synack = tcp_time_stamp; if (tcp_v4_send_synack(sk, dst, req, - (struct request_values *)&tmp_ext) || + (struct request_values *)&tmp_ext, + skb_get_queue_mapping(skb), + want_cookie) || want_cookie) goto drop_and_free; @@ -1657,6 +1657,51 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); +void tcp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct tcphdr *th; + struct net_device *dev; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) + return; + + iph = ip_hdr(skb); + th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) + return; + + dev = skb->dev; + sk = __inet_lookup_established(net, &tcp_hashinfo, + iph->saddr, th->source, + iph->daddr, ntohs(th->dest), + dev->ifindex); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst) + dst = dst_check(dst, 0); + if (dst) { + struct rtable *rt = (struct rtable *) dst; + + if (rt->rt_iif == dev->ifindex) + skb_dst_set_noref(skb, dst); + } + } + } +} + /* * From tcp_input.c */ @@ -1730,7 +1775,7 @@ process: #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); else @@ -1739,7 +1784,8 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else if (unlikely(sk_add_backlog(sk, skb))) { + } else if (unlikely(sk_add_backlog(sk, skb, + sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; @@ -1805,40 +1851,10 @@ do_time_wait: goto discard_it; } -struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) -{ - struct rtable *rt = (struct rtable *) __sk_dst_get(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_peer *peer; - - if (!rt || - inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { - peer = inet_getpeer_v4(inet->inet_daddr, 1); - *release_it = true; - } else { - if (!rt->peer) - rt_bind_peer(rt, inet->inet_daddr, 1); - peer = rt->peer; - *release_it = false; - } - - return peer; -} -EXPORT_SYMBOL(tcp_v4_get_peer); - -void *tcp_v4_tw_get_peer(struct sock *sk) -{ - const struct inet_timewait_sock *tw = inet_twsk(sk); - - return inet_getpeer_v4(tw->tw_daddr, 1); -} -EXPORT_SYMBOL(tcp_v4_tw_get_peer); - static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, - .twsk_getpeer = tcp_v4_tw_get_peer, }; const struct inet_connection_sock_af_ops ipv4_specific = { @@ -1847,7 +1863,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, - .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, @@ -1875,64 +1890,15 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - - skb_queue_head_init(&tp->out_of_order_queue); - tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); - - icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - tp->snd_cwnd = TCP_INIT_CWND; - - /* See draft-stevens-tcpca-spec-01 for discussion of the - * initialization of these values. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tp->snd_cwnd_clamp = ~0; - tp->mss_cache = TCP_MSS_DEFAULT; - tp->reordering = sysctl_tcp_reordering; - icsk->icsk_ca_ops = &tcp_init_congestion_ops; - - sk->sk_state = TCP_CLOSE; - - sk->sk_write_space = sk_stream_write_space; - sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + tcp_init_sock(sk); icsk->icsk_af_ops = &ipv4_specific; - icsk->icsk_sync_mss = tcp_sync_mss; + #ifdef CONFIG_TCP_MD5SIG - tp->af_specific = &tcp_sock_ipv4_specific; + tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; #endif - /* TCP Cookie Transactions */ - if (sysctl_tcp_cookie_size > 0) { - /* Default, cookies without s_data_payload. */ - tp->cookie_values = - kzalloc(sizeof(*tp->cookie_values), - sk->sk_allocation); - if (tp->cookie_values != NULL) - kref_init(&tp->cookie_values->kref); - } - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; - - local_bh_disable(); - sock_update_memcg(sk); - sk_sockets_allocated_inc(sk); - local_bh_enable(); - return 0; } @@ -1986,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tp->cookie_values = NULL; } + /* If socket is aborted during connect operation */ + tcp_free_fastopen_req(tp); + sk_sockets_allocated_dec(sk); sock_release_memcg(sk); } @@ -2109,7 +2078,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } -static inline int empty_bucket(struct tcp_iter_state *st) +static inline bool empty_bucket(struct tcp_iter_state *st) { return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); @@ -2626,6 +2595,7 @@ struct proto tcp_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, @@ -2657,13 +2627,11 @@ EXPORT_SYMBOL(tcp_prot); static int __net_init tcp_sk_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv4.tcp_sock, - PF_INET, SOCK_RAW, IPPROTO_TCP, net); + return 0; } static void __net_exit tcp_sk_exit(struct net *net) { - inet_ctl_sock_destroy(net->ipv4.tcp_sock); } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index e795272fbe9e..b6f3583ddfe8 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,37 +6,6 @@ #include <linux/memcontrol.h> #include <linux/module.h> -static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft); -static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, - const char *buffer); -static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event); - -static struct cftype tcp_files[] = { - { - .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, - .read_u64 = tcp_cgroup_read, - .private = RES_LIMIT, - }, - { - .name = "kmem.tcp.usage_in_bytes", - .read_u64 = tcp_cgroup_read, - .private = RES_USAGE, - }, - { - .name = "kmem.tcp.failcnt", - .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { - .name = "kmem.tcp.max_usage_in_bytes", - .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, -}; - static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) { return container_of(cg_proto, struct tcp_memcontrol, cg_proto); @@ -49,7 +18,7 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk) } EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); -int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) +int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { /* * The root cgroup does not use res_counters, but rather, @@ -59,13 +28,12 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) struct res_counter *res_parent = NULL; struct cg_proto *cg_proto, *parent_cg; struct tcp_memcontrol *tcp; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct net *net = current->nsproxy->net_ns; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - goto create_files; + return 0; tcp = tcp_from_cgproto(cg_proto); @@ -88,15 +56,12 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; cg_proto->memcg = memcg; -create_files: - return cgroup_add_files(cgrp, ss, tcp_files, - ARRAY_SIZE(tcp_files)); + return 0; } EXPORT_SYMBOL(tcp_init_cgroup); -void tcp_destroy_cgroup(struct cgroup *cgrp) +void tcp_destroy_cgroup(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct cg_proto *cg_proto; struct tcp_memcontrol *tcp; u64 val; @@ -109,9 +74,6 @@ void tcp_destroy_cgroup(struct cgroup *cgrp) percpu_counter_destroy(&tcp->tcp_sockets_allocated); val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); - - if (val != RESOURCE_MAX) - static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -142,10 +104,33 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, net->ipv4.sysctl_tcp_mem[i]); - if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) - static_key_slow_dec(&memcg_socket_limit_enabled); - else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - static_key_slow_inc(&memcg_socket_limit_enabled); + if (val == RESOURCE_MAX) + clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); + else if (val != RESOURCE_MAX) { + /* + * The active bit needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See sock_update_memcg() for + * details, and note that we don't mark any socket as belonging + * to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in sock_update_memcg(), + * because when this value change, the code to process it is not + * patched in yet. + * + * The activated bit is used to guarantee that no two writers + * will do the update in the same memcg. Without that, we can't + * properly shutdown the static key. + */ + if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) + static_key_slow_inc(&memcg_socket_limit_enabled); + set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); + } return 0; } @@ -270,3 +255,37 @@ void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) tcp->tcp_prot_mem[idx] = val; } + +static struct cftype tcp_files[] = { + { + .name = "kmem.tcp.limit_in_bytes", + .write_string = tcp_cgroup_write, + .read_u64 = tcp_cgroup_read, + .private = RES_LIMIT, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .read_u64 = tcp_cgroup_read, + .private = RES_USAGE, + }, + { + .name = "kmem.tcp.failcnt", + .private = RES_FAILCNT, + .trigger = tcp_cgroup_reset, + .read_u64 = tcp_cgroup_read, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = RES_MAX_USAGE, + .trigger = tcp_cgroup_reset, + .read_u64 = tcp_cgroup_read, + }, + { } /* terminate */ +}; + +static int __init tcp_memcontrol_init(void) +{ + WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + return 0; +} +__initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c new file mode 100644 index 000000000000..992f1bff4fc6 --- /dev/null +++ b/net/ipv4/tcp_metrics.c @@ -0,0 +1,744 @@ +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <linux/jiffies.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/tcp.h> +#include <linux/hash.h> + +#include <net/inet_connection_sock.h> +#include <net/net_namespace.h> +#include <net/request_sock.h> +#include <net/inetpeer.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/dst.h> +#include <net/tcp.h> + +int sysctl_tcp_nometrics_save __read_mostly; + +enum tcp_metric_index { + TCP_METRIC_RTT, + TCP_METRIC_RTTVAR, + TCP_METRIC_SSTHRESH, + TCP_METRIC_CWND, + TCP_METRIC_REORDERING, + + /* Always last. */ + TCP_METRIC_MAX, +}; + +struct tcp_fastopen_metrics { + u16 mss; + u16 syn_loss:10; /* Recurring Fast Open SYN losses */ + unsigned long last_syn_loss; /* Last Fast Open SYN loss */ + struct tcp_fastopen_cookie cookie; +}; + +struct tcp_metrics_block { + struct tcp_metrics_block __rcu *tcpm_next; + struct inetpeer_addr tcpm_addr; + unsigned long tcpm_stamp; + u32 tcpm_ts; + u32 tcpm_ts_stamp; + u32 tcpm_lock; + u32 tcpm_vals[TCP_METRIC_MAX]; + struct tcp_fastopen_metrics tcpm_fastopen; +}; + +static bool tcp_metric_locked(struct tcp_metrics_block *tm, + enum tcp_metric_index idx) +{ + return tm->tcpm_lock & (1 << idx); +} + +static u32 tcp_metric_get(struct tcp_metrics_block *tm, + enum tcp_metric_index idx) +{ + return tm->tcpm_vals[idx]; +} + +static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, + enum tcp_metric_index idx) +{ + return msecs_to_jiffies(tm->tcpm_vals[idx]); +} + +static void tcp_metric_set(struct tcp_metrics_block *tm, + enum tcp_metric_index idx, + u32 val) +{ + tm->tcpm_vals[idx] = val; +} + +static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, + enum tcp_metric_index idx, + u32 val) +{ + tm->tcpm_vals[idx] = jiffies_to_msecs(val); +} + +static bool addr_same(const struct inetpeer_addr *a, + const struct inetpeer_addr *b) +{ + const struct in6_addr *a6, *b6; + + if (a->family != b->family) + return false; + if (a->family == AF_INET) + return a->addr.a4 == b->addr.a4; + + a6 = (const struct in6_addr *) &a->addr.a6[0]; + b6 = (const struct in6_addr *) &b->addr.a6[0]; + + return ipv6_addr_equal(a6, b6); +} + +struct tcpm_hash_bucket { + struct tcp_metrics_block __rcu *chain; +}; + +static DEFINE_SPINLOCK(tcp_metrics_lock); + +static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + u32 val; + + val = 0; + if (dst_metric_locked(dst, RTAX_RTT)) + val |= 1 << TCP_METRIC_RTT; + if (dst_metric_locked(dst, RTAX_RTTVAR)) + val |= 1 << TCP_METRIC_RTTVAR; + if (dst_metric_locked(dst, RTAX_SSTHRESH)) + val |= 1 << TCP_METRIC_SSTHRESH; + if (dst_metric_locked(dst, RTAX_CWND)) + val |= 1 << TCP_METRIC_CWND; + if (dst_metric_locked(dst, RTAX_REORDERING)) + val |= 1 << TCP_METRIC_REORDERING; + tm->tcpm_lock = val; + + tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); + tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); + tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); + tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); + tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); + tm->tcpm_ts = 0; + tm->tcpm_ts_stamp = 0; + tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.cookie.len = 0; +} + +static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, + struct inetpeer_addr *addr, + unsigned int hash, + bool reclaim) +{ + struct tcp_metrics_block *tm; + struct net *net; + + spin_lock_bh(&tcp_metrics_lock); + net = dev_net(dst->dev); + if (unlikely(reclaim)) { + struct tcp_metrics_block *oldest; + + oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); + for (tm = rcu_dereference(oldest->tcpm_next); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) + oldest = tm; + } + tm = oldest; + } else { + tm = kmalloc(sizeof(*tm), GFP_ATOMIC); + if (!tm) + goto out_unlock; + } + tm->tcpm_addr = *addr; + tm->tcpm_stamp = jiffies; + + tcpm_suck_dst(tm, dst); + + if (likely(!reclaim)) { + tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; + rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); + } + +out_unlock: + spin_unlock_bh(&tcp_metrics_lock); + return tm; +} + +#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + tcpm_suck_dst(tm, dst); +} + +#define TCP_METRICS_RECLAIM_DEPTH 5 +#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL + +static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) +{ + if (tm) + return tm; + if (depth > TCP_METRICS_RECLAIM_DEPTH) + return TCP_METRICS_RECLAIM_PTR; + return NULL; +} + +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, + struct net *net, unsigned int hash) +{ + struct tcp_metrics_block *tm; + int depth = 0; + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, addr)) + break; + depth++; + } + return tcp_get_encode(tm, depth); +} + +static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, + struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + + addr.family = req->rsk_ops->family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = inet_rsk(req)->rmt_addr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; + hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); + break; + default: + return NULL; + } + + net = dev_net(dst->dev); + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, &addr)) + break; + } + tcpm_check_stamp(tm, dst); + return tm; +} + +static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) +{ + struct inet6_timewait_sock *tw6; + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + + addr.family = tw->tw_family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + tw6 = inet6_twsk((struct sock *)tw); + *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; + hash = ipv6_addr_hash(&tw6->tw_v6_daddr); + break; + default: + return NULL; + } + + net = twsk_net(tw); + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, &addr)) + break; + } + return tm; +} + +static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, + struct dst_entry *dst, + bool create) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + bool reclaim; + + addr.family = sk->sk_family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; + hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); + break; + default: + return NULL; + } + + net = dev_net(dst->dev); + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + + tm = __tcp_get_metrics(&addr, net, hash); + reclaim = false; + if (tm == TCP_METRICS_RECLAIM_PTR) { + reclaim = true; + tm = NULL; + } + if (!tm && create) + tm = tcpm_new(dst, &addr, hash, reclaim); + else + tcpm_check_stamp(tm, dst); + + return tm; +} + +/* Save metrics learned by this TCP session. This function is called + * only, when TCP finishes successfully i.e. when it enters TIME-WAIT + * or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_metrics_block *tm; + unsigned long rtt; + u32 val; + int m; + + if (sysctl_tcp_nometrics_save || !dst) + return; + + if (dst->flags & DST_HOST) + dst_confirm(dst); + + rcu_read_lock(); + if (icsk->icsk_backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. Reset our + * results. + */ + tm = tcp_get_metrics(sk, dst, false); + if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) + tcp_metric_set(tm, TCP_METRIC_RTT, 0); + goto out_unlock; + } else + tm = tcp_get_metrics(sk, dst, true); + + if (!tm) + goto out_unlock; + + rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); + m = rtt - tp->srtt; + + /* If newly calculated rtt larger than stored one, store new + * one. Otherwise, use EWMA. Remember, rtt overestimation is + * always better than underestimation. + */ + if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { + if (m <= 0) + rtt = tp->srtt; + else + rtt -= (m >> 3); + tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); + } + + if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { + unsigned long var; + + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + if (m >= var) + var = m; + else + var -= (var - m) >> 2; + + tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); + } + + if (tcp_in_initial_slowstart(tp)) { + /* Slow start still did not finish. */ + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + if (val && (tp->snd_cwnd >> 1) > val) + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + tp->snd_cwnd >> 1); + } + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { + val = tcp_metric_get(tm, TCP_METRIC_CWND); + if (tp->snd_cwnd > val) + tcp_metric_set(tm, TCP_METRIC_CWND, + tp->snd_cwnd); + } + } else if (tp->snd_cwnd > tp->snd_ssthresh && + icsk->icsk_ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { + val = tcp_metric_get(tm, TCP_METRIC_CWND); + tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); + } + } else { + /* Else slow start did not finish, cwnd is non-sense, + * ssthresh may be also invalid. + */ + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { + val = tcp_metric_get(tm, TCP_METRIC_CWND); + tcp_metric_set(tm, TCP_METRIC_CWND, + (val + tp->snd_ssthresh) >> 1); + } + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + if (val && tp->snd_ssthresh > val) + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + tp->snd_ssthresh); + } + if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { + val = tcp_metric_get(tm, TCP_METRIC_REORDERING); + if (val < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + tcp_metric_set(tm, TCP_METRIC_REORDERING, + tp->reordering); + } + } + tm->tcpm_stamp = jiffies; +out_unlock: + rcu_read_unlock(); +} + +/* Initialize metrics on socket. */ + +void tcp_init_metrics(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_metrics_block *tm; + u32 val; + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (!tm) { + rcu_read_unlock(); + goto reset; + } + + if (tcp_metric_locked(tm, TCP_METRIC_CWND)) + tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); + + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + if (val) { + tp->snd_ssthresh = val; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } else { + /* ssthresh may have been reduced unnecessarily during. + * 3WHS. Restore it back to its initial default. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + } + val = tcp_metric_get(tm, TCP_METRIC_REORDERING); + if (val && tp->reordering != val) { + tcp_disable_fack(tp); + tcp_disable_early_retrans(tp); + tp->reordering = val; + } + + val = tcp_metric_get(tm, TCP_METRIC_RTT); + if (val == 0 || tp->srtt == 0) { + rcu_read_unlock(); + goto reset; + } + /* Initial rtt is determined from SYN,SYN-ACK. + * The segment is small and rtt may appear much + * less than real one. Use per-dst memory + * to make it more realistic. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal circumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ + val = msecs_to_jiffies(val); + if (val > tp->srtt) { + tp->srtt = val; + tp->rtt_seq = tp->snd_nxt; + } + val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + if (val > tp->mdev) { + tp->mdev = val; + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + } + rcu_read_unlock(); + + tcp_set_rto(sk); +reset: + if (tp->srtt == 0) { + /* RFC6298: 5.7 We've failed to get a valid RTT sample from + * 3WHS. This is most likely due to retransmission, + * including spurious one. Reset the RTO back to 3secs + * from the more aggressive 1sec to avoid more spurious + * retransmission. + */ + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; + } + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +{ + struct tcp_metrics_block *tm; + bool ret; + + if (!dst) + return false; + + rcu_read_lock(); + tm = __tcp_get_metrics_req(req, dst); + if (paws_check) { + if (tm && + (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && + (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ret = false; + else + ret = true; + } else { + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) + ret = true; + else + ret = false; + } + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_peer_is_proven); + +void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; + tp->rx_opt.ts_recent = tm->tcpm_ts; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); + +/* VJ's idea. Save last timestamp seen from this destination and hold + * it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter + * synchronized state. + */ +bool tcp_remember_stamp(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + bool ret = false; + + if (dst) { + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && + tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; + tm->tcpm_ts = tp->rx_opt.ts_recent; + } + ret = true; + } + rcu_read_unlock(); + } + return ret; +} + +bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) +{ + struct tcp_metrics_block *tm; + bool ret = false; + + rcu_read_lock(); + tm = __tcp_get_metrics_tw(tw); + if (tw) { + const struct tcp_timewait_sock *tcptw; + struct sock *sk = (struct sock *) tw; + + tcptw = tcp_twsk(sk); + if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && + tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; + tm->tcpm_ts = tcptw->tw_ts_recent; + } + ret = true; + } + rcu_read_unlock(); + + return ret; +} + +static DEFINE_SEQLOCK(fastopen_seqlock); + +void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, + struct tcp_fastopen_cookie *cookie, + int *syn_loss, unsigned long *last_syn_loss) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); + if (tm) { + struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; + unsigned int seq; + + do { + seq = read_seqbegin(&fastopen_seqlock); + if (tfom->mss) + *mss = tfom->mss; + *cookie = tfom->cookie; + *syn_loss = tfom->syn_loss; + *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; + } while (read_seqretry(&fastopen_seqlock, seq)); + } + rcu_read_unlock(); +} + +void tcp_fastopen_cache_set(struct sock *sk, u16 mss, + struct tcp_fastopen_cookie *cookie, bool syn_lost) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, __sk_dst_get(sk), true); + if (tm) { + struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; + + write_seqlock_bh(&fastopen_seqlock); + tfom->mss = mss; + if (cookie->len > 0) + tfom->cookie = *cookie; + if (syn_lost) { + ++tfom->syn_loss; + tfom->last_syn_loss = jiffies; + } else + tfom->syn_loss = 0; + write_sequnlock_bh(&fastopen_seqlock); + } + rcu_read_unlock(); +} + +static unsigned int tcpmhash_entries; +static int __init set_tcpmhash_entries(char *str) +{ + ssize_t ret; + + if (!str) + return 0; + + ret = kstrtouint(str, 0, &tcpmhash_entries); + if (ret) + return 0; + + return 1; +} +__setup("tcpmhash_entries=", set_tcpmhash_entries); + +static int __net_init tcp_net_metrics_init(struct net *net) +{ + size_t size; + unsigned int slots; + + slots = tcpmhash_entries; + if (!slots) { + if (totalram_pages >= 128 * 1024) + slots = 16 * 1024; + else + slots = 8 * 1024; + } + + net->ipv4.tcp_metrics_hash_log = order_base_2(slots); + size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; + + net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL); + if (!net->ipv4.tcp_metrics_hash) + return -ENOMEM; + + return 0; +} + +static void __net_exit tcp_net_metrics_exit(struct net *net) +{ + kfree(net->ipv4.tcp_metrics_hash); +} + +static __net_initdata struct pernet_operations tcp_net_metrics_ops = { + .init = tcp_net_metrics_init, + .exit = tcp_net_metrics_exit, +}; + +void __init tcp_metrics_init(void) +{ + register_pernet_subsys(&tcp_net_metrics_ops); +} diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3cabafb5cdd1..5912ac3fd240 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -49,62 +49,12 @@ struct inet_timewait_death_row tcp_death_row = { }; EXPORT_SYMBOL_GPL(tcp_death_row); -/* VJ's idea. Save last timestamp seen from this destination - * and hold it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter synchronized - * state. - */ - -static int tcp_remember_stamp(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct inet_peer *peer; - bool release_it; - - peer = icsk->icsk_af_ops->get_peer(sk, &release_it); - if (peer) { - if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; - peer->tcp_ts = tp->rx_opt.ts_recent; - } - if (release_it) - inet_putpeer(peer); - return 1; - } - - return 0; -} - -static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) -{ - struct sock *sk = (struct sock *) tw; - struct inet_peer *peer; - - peer = twsk_getpeer(sk); - if (peer) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - - if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; - peer->tcp_ts = tcptw->tw_ts_recent; - } - inet_putpeer(peer); - return 1; - } - return 0; -} - -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) - return 1; + return true; if (after(end_seq, s_win) && before(seq, e_win)) - return 1; + return true; return seq == e_win && seq == end_seq; } @@ -143,11 +93,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; const u8 *hash_location; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - int paws_reject = 0; + bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -316,7 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) struct inet_timewait_sock *tw = NULL; const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); - int recycle_ok = 0; + bool recycle_ok = false; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); @@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + struct inet_sock *inet = inet_sk(sk); - tw->tw_transparent = inet_sk(sk)->transparent; + tw->tw_transparent = inet->transparent; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; @@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); + if (twsk->tw_md5_key) { tcp_free_md5sig_pool(); kfree_rcu(twsk->tw_md5_key, rcu); @@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + newsk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. @@ -470,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); + INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); @@ -482,6 +437,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tcp_enable_early_retrans(newtp); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -574,11 +530,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); - int paws_reject = 0; + bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 364784a91939..950aebfd9967 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -34,6 +34,8 @@ * */ +#define pr_fmt(fmt) "TCP: " fmt + #include <net/tcp.h> #include <linux/compiler.h> @@ -48,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -63,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) @@ -78,9 +85,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->frto_counter = 3; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + if (!prior_packets || tp->early_retrans_delayed) + tcp_rearm_rto(sk); } /* SND.NXT, if window was not shrunk. @@ -369,7 +375,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } -static inline int tcp_urg_mode(const struct tcp_sock *tp) +static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } @@ -379,15 +385,17 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_COOKIE_EXTENSION (1 << 4) +#define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { - u8 options; /* bit field of OPTION_* */ + u16 options; /* bit field of OPTION_* */ + u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ - u16 mss; /* 0 to disable */ - __u32 tsval, tsecr; /* need to include OPTION_TS */ __u8 *hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; /* The sysctl int routines are generic, so check consistency here. @@ -436,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired) static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { - u8 options = opts->options; /* mungable copy */ + u16 options = opts->options; /* mungable copy */ /* Having both authentication and cookies for security is redundant, * and there's certainly not enough room. Instead, the cookie-less @@ -558,21 +566,37 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, tp->rx_opt.dsack = 0; } + + if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { + struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + + *ptr++ = htonl((TCPOPT_EXP << 24) | + ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | + TCPOPT_FASTOPEN_MAGIC); + + memcpy(ptr, foc->val, foc->len); + if ((foc->len & 3) == 2) { + u8 *align = ((u8 *)ptr) + foc->len; + align[0] = align[1] = TCPOPT_NOP; + } + ptr += (foc->len + 3) >> 2; + } } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ -static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; - unsigned remaining = MAX_TCP_OPTION_SPACE; + unsigned int remaining = MAX_TCP_OPTION_SPACE; u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? tcp_cookie_size_check(cvp->cookie_desired) : 0; + struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -613,6 +637,16 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, remaining -= TCPOLEN_SACKPERM_ALIGNED; } + if (fastopen && fastopen->cookie.len >= 0) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = &fastopen->cookie; + remaining -= need; + tp->syn_fastopen = 1; + } + } /* Note that timestamps are required by the specification. * * Odd numbers of bytes are prohibited by the specification, ensuring @@ -663,15 +697,15 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, } /* Set up TCP options for SYN-ACKs. */ -static unsigned tcp_synack_options(struct sock *sk, +static unsigned int tcp_synack_options(struct sock *sk, struct request_sock *req, - unsigned mss, struct sk_buff *skb, + unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5, struct tcp_extend_values *xvp) { struct inet_request_sock *ireq = inet_rsk(req); - unsigned remaining = MAX_TCP_OPTION_SPACE; + unsigned int remaining = MAX_TCP_OPTION_SPACE; u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? xvp->cookie_plus : 0; @@ -742,13 +776,13 @@ static unsigned tcp_synack_options(struct sock *sk, /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. */ -static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); - unsigned size = 0; + unsigned int size = 0; unsigned int eff_sacks; #ifdef CONFIG_TCP_MD5SIG @@ -770,9 +804,9 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { - const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; opts->num_sack_blocks = - min_t(unsigned, eff_sacks, + min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + @@ -782,6 +816,152 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, return size; } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { + struct tasklet_struct tasklet; + struct list_head head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +static void tcp_tsq_handler(struct sock *sk) +{ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | + TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) + tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); +} +/* + * One tasklest per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transfering tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + LIST_HEAD(list); + unsigned long flags; + struct list_head *q, *n; + struct tcp_sock *tp; + struct sock *sk; + + local_irq_save(flags); + list_splice_init(&tsq->head, &list); + local_irq_restore(flags); + + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + tcp_tsq_handler(sk); + } else { + /* defer the work to tcp_release_cb() */ + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + } + bh_unlock_sock(sk); + + clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } +} + +#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ + (1UL << TCP_WRITE_TIMER_DEFERRED) | \ + (1UL << TCP_DELACK_TIMER_DEFERRED)) +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nflags; + + /* perform an atomic operation only if at least one flag is set */ + do { + flags = tp->tsq_flags; + if (!(flags & TCP_DEFERRED_ALL)) + return; + nflags = flags & ~TCP_DEFERRED_ALL; + } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + + if (flags & (1UL << TCP_TSQ_DEFERRED)) + tcp_tsq_handler(sk); + + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) + tcp_write_timer_handler(sk); + + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) + tcp_delack_timer_handler(sk); +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + + INIT_LIST_HEAD(&tsq->head); + tasklet_init(&tsq->tasklet, + tcp_tasklet_func, + (unsigned long)tsq); + } +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We cant xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { + unsigned long flags; + struct tsq_tasklet *tsq; + + /* Keep a ref on socket. + * This last ref will be released in tcp_tasklet_func() + */ + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = &__get_cpu_var(tsq_tasklet); + list_add(&tp->tsq_node, &tsq->head); + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + } else { + sock_wfree(skb); + } +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -801,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; - unsigned tcp_options_size, tcp_header_size; + unsigned int tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; struct tcphdr *th; int err; @@ -843,7 +1023,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - skb_set_owner_w(skb, sk); + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? + tcp_wfree : sock_wfree; + atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); @@ -1096,6 +1281,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); + skb->avail_size -= eat; len -= eat; if (!len) return; @@ -1149,7 +1335,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) } /* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(const struct sock *sk, int pmtu) +int tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1160,6 +1346,14 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu) */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); + /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ + if (icsk->icsk_af_ops->net_frag_header_len) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_allfrag(dst)) + mss_now -= icsk->icsk_af_ops->net_frag_header_len; + } + /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; @@ -1178,7 +1372,7 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu) } /* Inverse of above */ -int tcp_mss_to_mtu(const struct sock *sk, int mss) +int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1189,6 +1383,13 @@ int tcp_mss_to_mtu(const struct sock *sk, int mss) icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; + /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ + if (icsk->icsk_af_ops->net_frag_header_len) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_allfrag(dst)) + mtu += icsk->icsk_af_ops->net_frag_header_len; + } return mtu; } @@ -1258,7 +1459,7 @@ unsigned int tcp_current_mss(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; - unsigned header_len; + unsigned int header_len; struct tcp_out_options opts; struct tcp_md5sig_key *md5; @@ -1374,33 +1575,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, } /* Minshall's variant of the Nagle send check. */ -static inline int tcp_minshall_check(const struct tcp_sock *tp) +static inline bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } -/* Return 0, if packet can be sent now without violation Nagle's rules: +/* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ -static inline int tcp_nagle_check(const struct tcp_sock *tp, +static inline bool tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned mss_now, int nonagle) + unsigned int mss_now, int nonagle) { return skb->len < mss_now && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } -/* Return non-zero if the Nagle test allows this packet to be +/* Return true if the Nagle test allows this packet to be * sent now. */ -static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned int cur_mss, int nonagle) +static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). @@ -1409,24 +1610,25 @@ static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) - return 1; + return true; /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) - return 1; + return true; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) - return 1; + return true; - return 0; + return false; } /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned int cur_mss) +static bool tcp_snd_wnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1459,7 +1661,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, } /* Test if sending is allowed right now. */ -int tcp_may_send_now(struct sock *sk) +bool tcp_may_send_now(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); @@ -1529,7 +1731,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1589,11 +1791,11 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) /* Ok, it looks like it is advisable to defer. */ tp->tso_deferred = 1 | (jiffies << 1); - return 1; + return true; send_now: tp->tso_deferred = 0; - return 0; + return false; } /* Create a new MTU probe if we are ready. @@ -1735,11 +1937,11 @@ static int tcp_mtu_probe(struct sock *sk) * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * - * Returns 1, if no segments are in flight and we have queued segments, but - * cannot send anything now because of SWS or another problem. + * Returns true, if no segments are in flight and we have queued segments, + * but cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - int push_one, gfp_t gfp) +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1753,7 +1955,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { - return 0; + return false; } else if (result > 0) { sent_pkts = 1; } @@ -1762,6 +1964,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -1782,6 +1985,13 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + /* TSQ : sk_wmem_alloc accounts skb truesize, + * including skb overhead. But thats OK. + */ + if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + break; + } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, @@ -1812,7 +2022,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (likely(sent_pkts)) { tcp_cwnd_validate(sk); - return 0; + return false; } return !tp->packets_out && tcp_send_head(sk); } @@ -2011,22 +2221,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } /* Check if coalescing SKBs is legal. */ -static int tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) +static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) - return 0; + return false; /* TODO: SACK collapsing could be used to remove this condition */ if (skb_shinfo(skb)->nr_frags != 0) - return 0; + return false; if (skb_cloned(skb)) - return 0; + return false; if (skb == tcp_send_head(sk)) - return 0; + return false; /* Some heurestics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) - return 0; + return false; - return 1; + return true; } /* Collapse packets in the retransmit queue to make to create @@ -2037,7 +2247,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; - int first = 1; + bool first = true; if (!sysctl_tcp_retrans_collapse) return; @@ -2051,7 +2261,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, space -= skb->len; if (first) { - first = 0; + first = false; continue; } @@ -2060,7 +2270,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, /* Punt if not enough space exists in the first SKB for * the data in the second */ - if (skb->len > skb_tailroom(to)) + if (skb->len > skb_availroom(to)) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) @@ -2166,8 +2376,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - if (net_ratelimit()) - printk(KERN_DEBUG "retrans_out leaked.\n"); + net_dbg_ratelimited("retrans_out leaked\n"); } #endif if (!tp->retrans_out) @@ -2192,18 +2401,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Check if we forward retransmits are possible in the current * window/congestion state. */ -static int tcp_can_forward_retransmit(struct sock *sk) +static bool tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); /* Forward retransmissions are possible only during Recovery. */ if (icsk->icsk_ca_state != TCP_CA_Recovery) - return 0; + return false; /* No forward retransmissions in Reno are possible. */ if (tcp_is_reno(tp)) - return 0; + return false; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... @@ -2214,9 +2423,9 @@ static int tcp_can_forward_retransmit(struct sock *sk) */ if (tcp_may_send_now(sk)) - return 0; + return false; - return 1; + return true; } /* This gets called after a retransmit timeout, and the initially @@ -2401,7 +2610,7 @@ int tcp_send_synack(struct sock *sk) skb = tcp_write_queue_head(sk); if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + pr_debug("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { @@ -2425,7 +2634,16 @@ int tcp_send_synack(struct sock *sk) return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -/* Prepare a SYN-ACK. */ +/** + * tcp_make_synack - Prepare a SYN-ACK. + * sk: listener socket + * dst: dst entry attached to the SYNACK + * req: request_sock pointer + * rvp: request_values pointer + * + * Allocate one skb and build a SYNACK packet. + * @dst is consumed : Caller should not use it again. + */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct request_values *rvp) @@ -2444,14 +2662,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) s_data_desired = cvp->s_data_desired; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); - if (skb == NULL) + skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); + if (unlikely(!skb)) { + dst_release(dst); return NULL; - + } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); - skb_dst_set(skb, dst_clone(dst)); + skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) @@ -2561,7 +2780,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. */ -static void tcp_connect_init(struct sock *sk) +void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2616,15 +2835,121 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->rcv_nxt = 0; - tp->rcv_wup = 0; - tp->copied_seq = 0; + tp->snd_nxt = tp->write_seq; + + if (likely(!tp->repair)) + tp->rcv_nxt = 0; + tp->rcv_wup = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } +static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + tcb->end_seq += skb->len; + skb_header_release(skb); + __tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + tp->write_seq = tcb->end_seq; + tp->packets_out += tcp_skb_pcount(skb); +} + +/* Build and send a SYN with data and (cached) Fast Open cookie. However, + * queue a data-only packet after the regular SYN, such that regular SYNs + * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges + * only the SYN sequence, the data are retransmitted in the first ACK. + * If cookie is not cached or other error occurs, falls back to send a + * regular SYN with Fast Open cookie request option. + */ +static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_fastopen_request *fo = tp->fastopen_req; + int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; + struct sk_buff *syn_data = NULL, *data; + unsigned long last_syn_loss = 0; + + tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ + tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, + &syn_loss, &last_syn_loss); + /* Recurring FO SYN losses: revert to regular handshake temporarily */ + if (syn_loss > 1 && + time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { + fo->cookie.len = -1; + goto fallback; + } + + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) + fo->cookie.len = -1; + else if (fo->cookie.len <= 0) + goto fallback; + + /* MSS for SYN-data is based on cached MSS and bounded by PMTU and + * user-MSS. Reserve maximum option space for middleboxes that add + * private TCP options. The cost is reduced data space in SYN :( + */ + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + MAX_TCP_OPTION_SPACE; + + syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + sk->sk_allocation); + if (syn_data == NULL) + goto fallback; + + for (i = 0; i < iovlen && syn_data->len < space; ++i) { + struct iovec *iov = &fo->data->msg_iov[i]; + unsigned char __user *from = iov->iov_base; + int len = iov->iov_len; + + if (syn_data->len + len > space) + len = space - syn_data->len; + else if (i + 1 == iovlen) + /* No more data pending in inet_wait_for_connect() */ + fo->data = NULL; + + if (skb_add_data(syn_data, from, len)) + goto fallback; + } + + /* Queue a data-only packet after the regular SYN for retransmission */ + data = pskb_copy(syn_data, sk->sk_allocation); + if (data == NULL) + goto fallback; + TCP_SKB_CB(data)->seq++; + TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; + TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); + tcp_connect_queue_skb(sk, data); + fo->copied = data->len; + + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + tp->syn_data = (fo->copied > 0); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + goto done; + } + syn_data = NULL; + +fallback: + /* Send a regular SYN with Fast Open cookie request option */ + if (fo->cookie.len > 0) + fo->cookie.len = 0; + err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); + if (err) + tp->syn_fastopen = 0; + kfree_skb(syn_data); +done: + fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ + return err; +} + /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { @@ -2641,19 +2966,14 @@ int tcp_connect(struct sock *sk) /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); - tp->snd_nxt = tp->write_seq; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); - /* Send it off. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->retrans_stamp = TCP_SKB_CB(buff)->when; - skb_header_release(buff); - __tcp_add_write_queue_tail(sk, buff); - sk->sk_wmem_queued += buff->truesize; - sk_mem_charge(sk, buff->truesize); - tp->packets_out += tcp_skb_pcount(buff); - err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + /* Send off SYN; include data in Fast Open. */ + err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; @@ -2790,6 +3110,15 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } +void tcp_send_window_probe(struct sock *sk) +{ + if (sk->sk_state == TCP_ESTABLISHED) { + tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; + tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq; + tcp_xmit_probe_skb(sk, 0); + } +} + /* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk) { diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index a981cdc0a6e9..4526fe68e60e 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -91,7 +91,7 @@ static inline int tcp_probe_avail(void) * Note: arguments must match tcp_rcv_established()! */ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) + struct tcphdr *th, unsigned int len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -138,7 +138,7 @@ static struct jprobe tcp_jprobe = { .entry = jtcp_rcv_established, }; -static int tcpprobe_open(struct inode * inode, struct file * file) +static int tcpprobe_open(struct inode *inode, struct file *file) { /* Reset (empty) log */ spin_lock_bh(&tcp_probe.lock); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 34d4a02c2f16..6df36ad55a38 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; -static void tcp_write_timer(unsigned long); -static void tcp_delack_timer(unsigned long); -static void tcp_keepalive_timer (unsigned long data); - -void tcp_init_xmit_timers(struct sock *sk) -{ - inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, - &tcp_keepalive_timer); -} -EXPORT_SYMBOL(tcp_init_xmit_timers); - static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk) return 0; } -static void tcp_delack_timer(unsigned long data) +void tcp_delack_timer_handler(struct sock *sk) { - struct sock *sk = (struct sock *)data; struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); - goto out_unlock; - } - sk_mem_reclaim_partial(sk); if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) @@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data) out: if (sk_under_memory_pressure(sk)) sk_mem_reclaim(sk); -out_unlock: +} + +static void tcp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_delack_timer_handler(sk); + } else { + inet_csk(sk)->icsk_ack.blocked = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + /* deleguate our work to tcp_release_cb() */ + set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + } bh_unlock_sock(sk); sock_put(sk); } @@ -319,6 +312,11 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + if (tp->early_retrans_delayed) { + tcp_resume_early_retransmit(sk); + return; + } + if (!tp->packets_out) goto out; @@ -445,19 +443,11 @@ out_reset_timer: out:; } -static void tcp_write_timer(unsigned long data) +void tcp_write_timer_handler(struct sock *sk) { - struct sock *sk = (struct sock *)data; struct inet_connection_sock *icsk = inet_csk(sk); int event; - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); - goto out_unlock; - } - if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; @@ -480,7 +470,19 @@ static void tcp_write_timer(unsigned long data) out: sk_mem_reclaim(sk); -out_unlock: +} + +static void tcp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_write_timer_handler(sk); + } else { + /* deleguate our work to tcp_release_cb() */ + set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + } bh_unlock_sock(sk); sock_put(sk); } @@ -597,3 +599,10 @@ out: bh_unlock_sock(sk); sock_put(sk); } + +void tcp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); +} +EXPORT_SYMBOL(tcp_init_xmit_timers); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fe141052a1be..b4c3582a991f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -107,6 +107,8 @@ #include <net/checksum.h> #include <net/xfrm.h> #include <trace/events/udp.h> +#include <linux/static_key.h> +#include <trace/events/skb.h> #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -206,7 +208,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, if (!snum) { int low, high, remaining; - unsigned rand; + unsigned int rand; unsigned short first, last; DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); @@ -614,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + ipv4_sk_update_pmtu(skb, sk, info); if (inet->pmtudisc != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; @@ -627,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) err = icmp_err_convert[code].errno; } break; + case ICMP_REDIRECT: + ipv4_sk_redirect(skb, sk); + break; } /* @@ -846,7 +852,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -1218,8 +1224,10 @@ try_again: goto csum_copy_err; } - if (err) + if (unlikely(err)) { + trace_kfree_skb(skb, udp_recvmsg); goto out_free; + } if (!peeked) UDP_INC_STATS_USER(sock_net(sk), @@ -1379,6 +1387,14 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } +static struct static_key udp_encap_needed __read_mostly; +void udp_encap_enable(void) +{ + if (!static_key_enabled(&udp_encap_needed)) + static_key_slow_inc(&udp_encap_needed); +} +EXPORT_SYMBOL(udp_encap_enable); + /* returns: * -1: error * 0: success @@ -1400,7 +1416,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; nf_reset(skb); - if (up->encap_type) { + if (static_key_false(&udp_encap_needed) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* @@ -1470,7 +1486,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; - if (sk_rcvqueues_full(sk, skb)) + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) goto drop; rc = 0; @@ -1479,7 +1495,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); - else if (sk_add_backlog(sk, skb)) { + else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { bh_unlock_sock(sk); goto drop; } @@ -1760,6 +1776,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; + udp_encap_enable(); break; default: err = -ENOPROTOOPT; @@ -2163,9 +2180,15 @@ void udp4_proc_exit(void) static __initdata unsigned long uhash_entries; static int __init set_uhash_entries(char *str) { + ssize_t ret; + if (!str) return 0; - uhash_entries = simple_strtoul(str, &str, 0); + + ret = kstrtoul(str, 0, &uhash_entries); + if (ret) + return 0; + if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) uhash_entries = UDP_HTABLE_SIZE_MIN; return 1; @@ -2176,26 +2199,16 @@ void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i; - if (!CONFIG_BASE_SMALL) - table->hash = alloc_large_system_hash(name, - 2 * sizeof(struct udp_hslot), - uhash_entries, - 21, /* one slot per 2 MB */ - 0, - &table->log, - &table->mask, - 64 * 1024); - /* - * Make sure hash table has the minimum size - */ - if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { - table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * - 2 * sizeof(struct udp_hslot), GFP_KERNEL); - if (!table->hash) - panic(name); - table->log = ilog2(UDP_HTABLE_SIZE_MIN); - table->mask = UDP_HTABLE_SIZE_MIN - 1; - } + table->hash = alloc_large_system_hash(name, + 2 * sizeof(struct udp_hslot), + uhash_entries, + 21, /* one slot per 2 MB */ + 0, + &table->log, + &table->mask, + UDP_HTABLE_SIZE_MIN, + 64 * 1024); + table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8a949f19deb6..16d0960062be 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, int err = -EINVAL; struct sock *sk; struct sk_buff *rep; + struct net *net = sock_net(in_skb->sk); if (req->sdiag_family == AF_INET) - sk = __udp4_lib_lookup(&init_net, + sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, tbl); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) - sk = __udp6_lib_lookup(&init_net, + sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, @@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, kfree_skb(rep); goto out; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin struct inet_diag_req_v2 *r, struct nlattr *bc) { int num, s_num, slot, s_slot; + struct net *net = sock_net(skb->sk); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin sk_nulls_for_each(sk, node, &hslot->head) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -146,9 +150,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, return udp_dump_one(&udp_table, in_skb, nlh, req); } +static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, + .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, }; @@ -167,6 +179,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr * static const struct inet_diag_handler udplite_diag_handler = { .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, + .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, }; diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index aaad650d47d9..5a681e298b90 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); extern int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); +extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); extern void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ed4bf11ef9f4..ddee0a099a2c 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,6 +15,65 @@ #include <net/ip.h> #include <net/xfrm.h> +/* Informational hook. The decap is still done here. */ +static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); + +int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler) +{ + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -EEXIST; + int priority = handler->priority; + + mutex_lock(&xfrm4_mode_tunnel_input_mutex); + + for (pprev = &rcv_notify_handlers; + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) + break; + if (t->priority == priority) + goto err; + + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm4_mode_tunnel_input_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); + +int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler) +{ + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -ENOENT; + + mutex_lock(&xfrm4_mode_tunnel_input_mutex); + for (pprev = &rcv_notify_handlers; + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + mutex_unlock(&xfrm4_mode_tunnel_input_mutex); + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); + static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); @@ -64,8 +123,14 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } +#define for_each_input_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) + static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { + struct xfrm_tunnel *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -74,6 +139,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; + for_each_input_rcu(rcv_notify_handlers, handler) + handler->handler(skb); + if (skb_cloned(skb) && (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index a0b4c5da8d43..fcf7678bc009 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -90,10 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.dst.dev = dev; dev_hold(dev); - xdst->u.rt.peer = rt->peer; - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | @@ -102,7 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_src = rt->rt_src; xdst->u.rt.rt_dst = rt->rt_dst; xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; + xdst->u.rt.rt_pmtu = rt->rt_pmtu; return 0; } @@ -152,7 +148,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_AH: if (pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr = (__be32*)xprth; + __be32 *ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; } @@ -198,12 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } -static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, sk, skb, mtu); +} + +static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->redirect(path, sk, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) @@ -212,9 +218,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) dst_destroy_metrics_generic(dst); - if (likely(xdst->u.rt.peer)) - inet_putpeer(xdst->u.rt.peer); - xfrm_dst_destroy(xdst); } @@ -232,6 +235,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, @@ -298,8 +302,8 @@ void __init xfrm4_init(int rt_max_size) xfrm4_state_init(); xfrm4_policy_init(); #ifdef CONFIG_SYSCTL - sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, - xfrm4_policy_table); + sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4", + xfrm4_policy_table); #endif } |
